Skip to main content

About the Example

This example demonstrates a complete document processing pipeline using the Unified OCR system. It processes all PDF documents in a directory, extracts text with preprocessing, saves results and metadata, and generates a processing summary with metrics.

Unified OCR Configuration

from upsonic.ocr import OCR
from upsonic.ocr.easyocr import EasyOCR

# Options passed to OCR together with the EasyOCR provider class.
# They are collected in one mapping so each setting can be tuned in one place.
ocr_options = {
    'languages': ['en'],
    'confidence_threshold': 0.6,
    'rotation_fix': True,
    'enhance_contrast': True,
    'remove_noise': True,
    'pdf_dpi': 250,
    'gpu': True,
}

# Configure OCR with preprocessing for best results
ocr = OCR(EasyOCR, **ocr_options)

Full Code

from upsonic.ocr import OCR
from upsonic.ocr.easyocr import EasyOCR
from upsonic.ocr.exceptions import OCRError
from pathlib import Path
import json

def process_documents(directory: str, output_dir: str) -> None:
    """Run OCR over every PDF in a directory and save the results.

    For each ``*.pdf`` file directly inside ``directory``, the extracted
    text is written to ``<stem>.txt`` and the result dictionary to
    ``<stem>_metadata.json`` inside ``output_dir``. Per-file details are
    printed as each file completes, followed by a summary taken from the
    OCR instance's metrics.

    Args:
        directory: Folder searched (non-recursively) for PDF files.
        output_dir: Folder that receives the output files. It is created,
            together with any missing parent folders, if it does not exist.

    A file whose processing raises ``OCRError`` is reported and skipped so
    that one bad document does not abort the whole batch.
    """

    # Create OCR instance with preprocessing enabled
    ocr = OCR(
        EasyOCR,
        languages=['en'],
        confidence_threshold=0.6,
        rotation_fix=True,
        enhance_contrast=True,
        remove_noise=True,
        pdf_dpi=250,
        gpu=True
    )

    # parents=True lets callers pass a nested path such as "out/run1/text"
    # without having to create the intermediate folders first.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Sort the matches: glob order depends on the filesystem, and a stable
    # order keeps the log reproducible between runs.
    input_path = Path(directory)
    for pdf_file in sorted(input_path.glob('*.pdf')):
        try:
            print(f"Processing {pdf_file.name}...")

            # Extract text with detailed results
            result = ocr.process_file(pdf_file)

            # Write explicitly as UTF-8: recognised text can contain
            # characters that the platform default encoding cannot represent.
            text_file = output_path / f"{pdf_file.stem}.txt"
            text_file.write_text(result.text, encoding="utf-8")

            # Save metadata next to the text file
            metadata_file = output_path / f"{pdf_file.stem}_metadata.json"
            metadata_file.write_text(
                json.dumps(result.to_dict(), indent=2),
                encoding="utf-8",
            )

            # Log results
            print(f"  ✓ Extracted {len(result.text)} characters")
            print(f"  ✓ Confidence: {result.confidence:.2%}")
            print(f"  ✓ Pages: {result.page_count}")
            print(f"  ✓ Time: {result.processing_time_ms:.0f}ms")
            print(f"  ✓ Blocks: {len(result.blocks)}")

        except OCRError as e:
            # Best effort: report the failure and move on to the next file.
            print(f"  ✗ Error: {e}")

    # Print summary
    metrics = ocr.get_metrics()
    print("\n=== Summary ===")
    print(f"Files processed: {metrics.files_processed}")
    print(f"Total pages: {metrics.total_pages}")
    print(f"Total characters: {metrics.total_characters}")
    print(f"Average confidence: {metrics.average_confidence:.2%}")
    print(f"Total time: {metrics.processing_time_ms / 1000:.2f}s")

if __name__ == "__main__":
    # Script entry point: read PDFs from ./input_pdfs, write to ./output_text.
    process_documents(directory='input_pdfs', output_dir='output_text')

Multi-Language Document Processing

Process documents containing multiple languages using EasyOCR’s multi-language support.
from upsonic.ocr import OCR
from upsonic.ocr.easyocr import EasyOCR

# Build one OCR instance that is asked to recognise all of the codes below.
# NOTE(review): upstream EasyOCR documents codes such as 'ch_sim'/'ch_tra' and
# restricts which languages may share a reader — confirm that the wrapper
# accepts this exact combination.
language_codes = ['en', 'zh', 'ja', 'ko']
ocr = OCR(
    EasyOCR,
    languages=language_codes,
    confidence_threshold=0.5,
    gpu=True,
)

# Run OCR on the mixed-language document
result = ocr.process_file('multilingual_doc.pdf')

# Report the full text and the overall confidence
print(f"Extracted text:\n{result.text}\n")
print(f"Overall confidence: {result.confidence:.2%}")

# Collect the blocks whose own confidence is below 0.6
low_confidence_blocks = []
for block in result.blocks:
    if block.confidence < 0.6:
        low_confidence_blocks.append(block)
print(f"Low confidence blocks: {len(low_confidence_blocks)}")

# Print a short report for the first five blocks only
preview_blocks = result.blocks[:5]
for i, block in enumerate(preview_blocks, start=1):
    print(f"\nBlock {i}:")
    print(f"  Text: {block.text[:50]}...")
    print(f"  Confidence: {block.confidence:.2%}")
    # The position line is printed only when the block has a truthy bbox.
    if not block.bbox:
        continue
    print(f"  Position: ({block.bbox.x:.0f}, {block.bbox.y:.0f})")

Invoice Data Extraction with PaddleOCR

Extract structured fields from invoices using PPChatOCRv4’s table recognition, seal recognition, and key-based chat extraction with vector retrieval.
from upsonic.ocr import OCR
from upsonic.ocr.paddleocr import PPChatOCRv4

# OCR instance backed by PPChatOCRv4 with table and seal recognition switched on
ocr = OCR(
    PPChatOCRv4,
    lang='en',
    use_table_recognition=True,
    use_seal_recognition=True,
)

# Step 1: run the visual prediction over the invoice
visual_result = ocr.provider.visual_predict('invoice.pdf')

# Step 2: build the vector index that the chat step uses for retrieval
vector_info = ocr.provider.build_vector(
    visual_result,
    block_size=300,
    min_characters=3500,
)

# Step 3: ask for the fields of interest
requested_fields = [
    'invoice_number',
    'invoice_date',
    'vendor_name',
    'total_amount',
    'tax_amount',
    'line_items',
]
# NOTE(review): the code below treats the return value as a flat mapping keyed
# by field name — confirm against the provider's actual return shape.
invoice_data = ocr.provider.chat(
    key_list=requested_fields,
    visual_info=visual_result,
    vector_info=vector_info,
    use_vector_retrieval=True,
)

# Display extracted information, one "Label: value" line per scalar field
labelled_fields = (
    ("Invoice Number", 'invoice_number'),
    ("Date", 'invoice_date'),
    ("Vendor", 'vendor_name'),
    ("Total", 'total_amount'),
    ("Tax", 'tax_amount'),
)
for label, field in labelled_fields:
    print(f"{label}: {invoice_data.get(field)}")
print(f"\nLine Items: {invoice_data.get('line_items')}")