#!/usr/bin/env python3
"""Extract plaintext from PDF documents using Docling, with a PyMuPDF fallback."""

import re
from pathlib import Path

import fitz  # PyMuPDF, used as the fallback extractor
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption


def clean_text(text):
    """Clean text so it is plaintext with no special characters or LaTeX."""
    if not text:
        return ""

    # Remove LaTeX commands and math expressions
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # \command{content}
    text = re.sub(r'\$[^$]*\$', '', text)  # $math$ expressions
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # remaining \commands

    # Remove special characters, keeping common punctuation
    text = re.sub(r'[^\w\s.,!?;:\-()\[\]"\'/&%@#$+=<>]', ' ', text)

    # Collapse runs of spaces and tabs, but preserve newlines so that the
    # page separators and paragraph breaks relied on below survive cleaning
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n\n', text)

    return text.strip()
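

# Quick illustration of clean_text's behavior (a hedged sketch; the exact
# output depends on the regexes above):
#
#   clean_text(r"Euler's identity \emph{says} $e^{i\pi} + 1 = 0$.")
#   # -> "Euler's identity ."  (LaTeX commands and $...$ math are stripped)
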
def extract_text_with_docling(pdf_path, output_dir, document_name):
    """Extract text content from a PDF using Docling, with a PyMuPDF fallback."""
    print(f"Extracting text content with Docling: {pdf_path}")

    text_content = None
    try:
        # OCR settings live on the PDF pipeline options, not on the converter
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        # RapidOCR is a faster engine; this assumes the optional rapidocr
        # dependency is installed (EasyOcrOptions is the stock alternative)
        pipeline_options.ocr_options = RapidOcrOptions()

        converter = DocumentConverter(
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
        )

        # Convert the PDF and export the document body as Markdown
        result = converter.convert(pdf_path)
        text_content = result.document.export_to_markdown()
    except Exception as e:
        print(f"❌ Docling failed: {e}")
        print("🔄 Trying PyMuPDF fallback...")
        text_content = extract_text_with_pymupdf(pdf_path)

    if not text_content:
        print("⚠️ PyMuPDF fallback also failed")
        return None

    # Clean the text to ensure it's plaintext
    text_content = clean_text(text_content)

    # Create the output directory structure if it doesn't exist
    processed_dir = Path(output_dir) / document_name
    processed_dir.mkdir(parents=True, exist_ok=True)

    # Save the text content to a file
    text_file = processed_dir / f"{document_name}_text_content.md"
    text_file.write_text(text_content, encoding='utf-8')

    print(f"✅ Text content extracted and saved to: {text_file}")

    return {
        'text_content': text_content,
        'text_file': text_file,
        'processed_dir': processed_dir,
    }
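

# Typical call (a hedged sketch; the path and document name are hypothetical):
#
#   info = extract_text_with_docling("slides/deck.pdf", "processed", "deck")
#   if info:
#       print(info['text_file'])  # processed/deck/deck_text_content.md
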
def extract_text_with_pymupdf(pdf_path):
    """Extract text using PyMuPDF as a fallback, with clean formatting."""
    try:
        doc = fitz.open(pdf_path)
        text_content = ""

        for page_num, page in enumerate(doc, start=1):
            # Extract and clean the raw page text
            page_text = clean_text(page.get_text())

            # Add a page separator; get_slide_text_content() splits on this
            # exact "--- Page N ---" format
            text_content += f"\n--- Page {page_num} ---\n"
            text_content += page_text
            text_content += "\n"

        doc.close()
        return text_content

    except Exception as e:
        print(f"PyMuPDF extraction failed: {e}")
        return None
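

# Standalone use of the fallback extractor (hedged; the path is hypothetical):
#
#   raw = extract_text_with_pymupdf("slides/deck.pdf")
#   if raw:
#       print(raw[:200])  # starts with the "--- Page 1 ---" separator
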
def get_slide_text_content(text_content, slide_num):
    """Extract the text content for a specific slide from the full document text."""
    try:
        if not text_content:
            return ""

        # Split on the page separators written by extract_text_with_pymupdf
        pages = text_content.split('--- Page')

        # Find the page for this slide; after strip() the leading space left
        # by the split is gone, so match "N ---" rather than " N ---"
        target_page = None
        for page in pages:
            if page.strip().startswith(f"{slide_num} ---"):
                target_page = page
                break

        if target_page:
            # Drop the page-header line and clean up the rest
            lines = target_page.split('\n')[1:]
            slide_text = '\n'.join(lines).strip()
            return clean_text(slide_text)

        # Fallback: treat blank-line-separated sections as slides
        sections = text_content.split('\n\n')
        if slide_num <= len(sections):
            return clean_text(sections[slide_num - 1] if slide_num > 0 else sections[0])

        # Last resort: return a window of the text, assuming roughly five
        # lines per slide
        lines = text_content.split('\n')
        start_line = (slide_num - 1) * 5
        end_line = min(start_line + 10, len(lines))  # up to 10 lines
        return clean_text('\n'.join(lines[start_line:end_line]))

    except Exception as e:
        print(f"⚠️ Error extracting text for slide {slide_num}: {e}")
        return f"[Text content for slide {slide_num} could not be extracted]"
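

# A minimal end-to-end sketch of how these helpers compose (a hedged example;
# the PDF path and document name below are hypothetical placeholders):
if __name__ == "__main__":
    info = extract_text_with_docling("slides/deck.pdf", "processed", "deck")
    if info:
        # Docling output has no "--- Page N ---" separators, so
        # get_slide_text_content falls back to blank-line sections here
        first_slide = get_slide_text_content(info['text_content'], 1)
        print(first_slide[:300])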