Document Processing Examples¶
This guide demonstrates various document processing workflows using Dataknobs.
Basic Document Creation¶
Creating a Simple Document¶
from dataknobs_structures import Document

# Create a document with text content
doc = Document(
    text="This is a sample document containing important information.",
    metadata={
        "title": "Sample Document",
        "author": "John Doe",
        "created": "2024-01-01"
    }
)

print(f"Document ID: {doc.doc_id}")
print(f"Text: {doc.text}")
print(f"Metadata: {doc.metadata}")
Document with Sections¶
from dataknobs_structures import Document

# Create a structured document
sections = [
    {"title": "Introduction", "content": "This is the introduction."},
    {"title": "Main Body", "content": "This is the main content."},
    {"title": "Conclusion", "content": "This is the conclusion."}
]

doc = Document(
    text="\n\n".join([s["content"] for s in sections]),
    metadata={"sections": sections}
)
File-based Document Processing¶
Processing Text Files¶
from dataknobs_utils import file_utils
from dataknobs_structures import Document

def process_text_file(filepath):
    """Process a text file into a Document."""
    # Read file content
    content = file_utils.read_file(filepath)

    # Create document
    doc = Document(
        text=content,
        metadata={
            "source": filepath,
            "size": len(content),
            "lines": content.count('\n') + 1
        }
    )
    return doc

# Process a file
doc = process_text_file("data/sample.txt")
Batch Document Processing¶
from dataknobs_utils import file_utils
from dataknobs_structures import Document
import os

def process_directory(directory_path):
    """Process all text files in a directory."""
    documents = []
    for filepath in file_utils.filepath_generator(directory_path):
        if filepath.endswith('.txt'):
            content = file_utils.read_file(filepath)
            doc = Document(
                text=content,
                metadata={
                    "source": filepath,
                    "filename": os.path.basename(filepath)
                }
            )
            documents.append(doc)
    return documents

# Process all documents
docs = process_directory("data/documents/")
print(f"Processed {len(docs)} documents")
Document Transformation¶
Text Normalization¶
from dataknobs_structures import Document
from dataknobs_xization import normalize

def normalize_document(doc):
    """Normalize document text."""
    # Apply various normalizations
    normalized_text = doc.text

    # Expand camelCase
    normalized_text = normalize.expand_camelcase_fn(normalized_text)

    # Expand ampersands
    normalized_text = normalize.expand_ampersand_fn(normalized_text)

    # Remove extra whitespace
    normalized_text = normalize.normalize_whitespace_fn(normalized_text)

    # Create new document with normalized text
    return Document(
        text=normalized_text,
        metadata={**doc.metadata, "normalized": True}
    )

# Example usage
original = Document("getUserName&validateInput", metadata={"type": "code"})
normalized = normalize_document(original)
print(f"Original: {original.text}")
print(f"Normalized: {normalized.text}")
Document Chunking¶
from dataknobs_structures import Document

def chunk_document(doc, chunk_size=1000, overlap=100):
    """Split document into overlapping chunks."""
    text = doc.text
    chunks = []

    # The step (chunk_size - overlap) must be positive, so overlap should be
    # smaller than chunk_size.
    for i in range(0, len(text), chunk_size - overlap):
        chunk_text = text[i:i + chunk_size]
        chunk_doc = Document(
            text=chunk_text,
            metadata={
                **doc.metadata,
                "chunk_index": len(chunks),
                "chunk_start": i,
                "chunk_end": min(i + chunk_size, len(text))
            }
        )
        chunks.append(chunk_doc)
    return chunks

# Split a large document. With the defaults, each chunk starts 900 characters
# after the previous one, so a 5000-character document yields 6 chunks
# (offsets 0, 900, ..., 4500), the last of which is only 500 characters long.
large_doc = Document("A" * 5000, metadata={"title": "Large Document"})
chunks = chunk_document(large_doc)
print(f"Created {len(chunks)} chunks")
Document Analysis¶
Extracting Statistics¶
from dataknobs_structures import Document
import re

def analyze_document(doc):
    """Extract statistics from a document."""
    text = doc.text
    stats = {
        "character_count": len(text),
        "word_count": len(text.split()),
        "line_count": text.count('\n') + 1,
        # Filter out empty fragments so trailing punctuation and blank lines
        # are not counted as extra sentences or paragraphs
        "sentence_count": len([s for s in re.split(r'[.!?]+', text) if s.strip()]),
        "paragraph_count": len([p for p in re.split(r'\n\n+', text) if p.strip()]),
        "unique_words": len(set(text.lower().split())),
        "average_word_length": sum(len(word) for word in text.split()) / max(len(text.split()), 1)
    }
    return stats

# Analyze a document
doc = Document("""
This is a sample document. It has multiple sentences!
And even multiple paragraphs?

This is the second paragraph.
""")
stats = analyze_document(doc)
for key, value in stats.items():
    print(f"{key}: {value}")
Keyword Extraction¶
from dataknobs_structures import Document
from collections import Counter
import re

def extract_keywords(doc, num_keywords=10):
    """Extract top keywords from a document."""
    # Simple keyword extraction based on frequency
    text = doc.text.lower()

    # Remove punctuation and split into words
    words = re.findall(r'\b[a-z]+\b', text)

    # Filter out common stop words (simplified)
    stop_words = {'the', 'is', 'at', 'which', 'on', 'a', 'an', 'and', 'or', 'but'}
    words = [w for w in words if w not in stop_words and len(w) > 2]

    # Count frequencies
    word_counts = Counter(words)

    # Return top keywords
    return word_counts.most_common(num_keywords)

# Extract keywords
doc = Document("""
Machine learning is a subset of artificial intelligence.
Machine learning algorithms build a model based on training data.
The model can make predictions or decisions without being explicitly programmed.
""")
keywords = extract_keywords(doc, 5)
print("Top keywords:")
for word, count in keywords:
    print(f"  {word}: {count}")
Document Storage and Retrieval¶
Using RecordStore¶
from dataknobs_structures import Document, RecordStore

# Create a record store for documents
doc_store = RecordStore()

# Store documents
doc1 = Document("First document", metadata={"id": "doc1"})
doc2 = Document("Second document", metadata={"id": "doc2"})
doc_store.add("doc1", doc1)
doc_store.add("doc2", doc2)

# Retrieve documents
retrieved = doc_store.get("doc1")
print(f"Retrieved: {retrieved.text}")

# List all documents
all_docs = doc_store.get_all()
print(f"Total documents: {len(all_docs)}")
Document Serialization¶
from dataknobs_structures import Document
from dataknobs_utils import json_utils
import json

def serialize_document(doc):
    """Serialize a document to JSON."""
    doc_dict = {
        "text": doc.text,
        "metadata": doc.metadata,
        "doc_id": doc.doc_id
    }
    return json.dumps(doc_dict, indent=2)

def deserialize_document(json_str):
    """Deserialize a document from JSON."""
    doc_dict = json.loads(json_str)
    # doc_id is included in the serialized form above but is not passed back
    # in here, so the round-tripped document will not keep the original doc_id.
    return Document(
        text=doc_dict["text"],
        metadata=doc_dict.get("metadata", {})
    )

# Example usage
original = Document("Sample text", metadata={"author": "Jane"})
serialized = serialize_document(original)
print(f"Serialized:\n{serialized}")

deserialized = deserialize_document(serialized)
print(f"\nDeserialized text: {deserialized.text}")
Advanced Document Processing¶
Document Pipeline¶
from dataknobs_structures import Document
from dataknobs_xization import normalize
from dataknobs_utils import file_utils

class DocumentPipeline:
    """A configurable document processing pipeline."""

    def __init__(self):
        self.processors = []

    def add_processor(self, processor_func):
        """Add a processing step."""
        self.processors.append(processor_func)
        return self

    def process(self, doc):
        """Process document through all steps."""
        result = doc
        for processor in self.processors:
            result = processor(result)
        return result

# Define processing functions
def normalize_text(doc):
    """Normalize document text."""
    return Document(
        normalize.basic_normalization_fn(doc.text),
        metadata={**doc.metadata, "normalized": True}
    )

def add_statistics(doc):
    """Add text statistics to metadata."""
    doc.metadata["stats"] = {
        "words": len(doc.text.split()),
        "chars": len(doc.text)
    }
    return doc

def add_hash(doc):
    """Add content hash to metadata."""
    import hashlib
    doc.metadata["hash"] = hashlib.md5(doc.text.encode()).hexdigest()
    return doc

# Create and use pipeline
pipeline = DocumentPipeline()
pipeline.add_processor(normalize_text)
pipeline.add_processor(add_statistics)
pipeline.add_processor(add_hash)

# Process a document
input_doc = Document("ProcessThisText&CalculateStats")
output_doc = pipeline.process(input_doc)
print(f"Original: {input_doc.text}")
print(f"Processed: {output_doc.text}")
print(f"Metadata: {output_doc.metadata}")
Document Comparison¶
from dataknobs_structures import Document
import difflib

def compare_documents(doc1, doc2):
    """Compare two documents and find differences."""
    # Simple text similarity
    similarity = difflib.SequenceMatcher(
        None, doc1.text, doc2.text
    ).ratio()

    # Find differences
    differ = difflib.unified_diff(
        doc1.text.splitlines(),
        doc2.text.splitlines(),
        lineterm='',
        fromfile='doc1',
        tofile='doc2'
    )

    return {
        "similarity": similarity,
        "differences": list(differ),
        "doc1_unique_words": set(doc1.text.split()) - set(doc2.text.split()),
        "doc2_unique_words": set(doc2.text.split()) - set(doc1.text.split())
    }

# Compare documents
doc1 = Document("The quick brown fox jumps over the lazy dog")
doc2 = Document("The quick brown fox leaps over a lazy cat")
comparison = compare_documents(doc1, doc2)
print(f"Similarity: {comparison['similarity']:.2%}")
print(f"Doc1 unique words: {comparison['doc1_unique_words']}")
print(f"Doc2 unique words: {comparison['doc2_unique_words']}")
Integration Examples¶
Document to Tree Structure¶
from dataknobs_structures import Document, Tree

def document_to_tree(doc):
    """Convert a document to a tree structure."""
    # Create root node from document
    root = Tree(doc)

    # If document has sections, create child nodes
    if "sections" in doc.metadata:
        for section in doc.metadata["sections"]:
            section_doc = Document(
                section["content"],
                metadata={"title": section["title"]}
            )
            root.add_child(section_doc)
    return root

# Create structured document
doc = Document(
    "Main document content",
    metadata={
        "title": "Main",
        "sections": [
            {"title": "Intro", "content": "Introduction text"},
            {"title": "Body", "content": "Body text"},
            {"title": "Conclusion", "content": "Conclusion text"}
        ]
    }
)

# Convert to tree
tree = document_to_tree(doc)
print(f"Root: {tree.data.metadata['title']}")
for child in tree.children:
    print(f"  Child: {child.data.metadata['title']}")
Best Practices¶
- Always include metadata: Track source, creation time, and processing steps
- Handle encoding properly: Use UTF-8 for text files
- Validate input: Check document content before processing
- Use appropriate chunk sizes: Balance between processing efficiency and accuracy
- Implement error handling: Gracefully handle malformed documents; the sketch below combines this point with the metadata, encoding, and validation practices above
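As an illustration of several of these practices together, here is a minimal sketch that loads a text file into a Document with explicit UTF-8 decoding, a basic content check, and error handling. It assumes only the Document constructor used in the earlier examples plus the standard library; the helper name load_document_safely and its error-reporting choices are illustrative, not part of the Dataknobs API.

from datetime import datetime, timezone
from pathlib import Path

from dataknobs_structures import Document

def load_document_safely(filepath):
    """Load a text file into a Document, or return None on failure.

    Illustrative helper only -- not part of the Dataknobs API.
    """
    path = Path(filepath)
    try:
        # Read with an explicit encoding instead of relying on the platform default
        content = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # Gracefully handle missing, unreadable, or badly encoded files
        print(f"Skipping {path}: {exc}")
        return None

    # Validate input: reject empty or whitespace-only files before processing
    if not content.strip():
        print(f"Skipping {path}: file is empty")
        return None

    # Always include metadata: record source and load time for traceability
    return Document(
        text=content,
        metadata={
            "source": str(path),
            "loaded_at": datetime.now(timezone.utc).isoformat(),
            "size": len(content)
        }
    )

doc = load_document_safely("data/sample.txt")
if doc is not None:
    print(f"Loaded {doc.metadata['source']} ({doc.metadata['size']} characters)")

Returning None on failure keeps batch loops like process_directory above simple; raising a custom exception instead is an equally reasonable design choice.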