# technical-screen-2025-10-22/modules/docling_processor.py
# (extraction metadata: 175 lines, 6.2 KiB, Python)
# Import-time trace used to debug module load order.
print('🔴 DOCLING_PROCESSOR.PY: Starting import...')
#!/usr/bin/env python3
# NOTE(review): the shebang above is not the first line of the file, so it is
# inert; move it to line 1 if this module is meant to run as a script.
from docling.document_converter import DocumentConverter
from pathlib import Path
import fitz # PyMuPDF as fallback
import re
def clean_text(text):
    """Normalize *text* to plaintext: strip LaTeX, special characters,
    and redundant whitespace.

    Newlines are preserved (runs of blank lines collapse to a single blank
    line) so downstream splitting on line/page separators keeps working.
    The previous version collapsed ALL whitespace — including newlines — to
    single spaces, which destroyed the line structure and made its final
    blank-line regex dead code.

    Args:
        text: Raw extracted text (may be None or empty).

    Returns:
        Cleaned plaintext string ("" for falsy input).
    """
    if not text:
        return ""
    # Normalize line endings first so the whitespace rules below are uniform.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Remove LaTeX commands and math expressions.
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # \command{content}
    text = re.sub(r'\$[^$]*\$', '', text)             # $math$ expressions
    text = re.sub(r'\\[a-zA-Z]+', '', text)           # remaining bare \commands
    # Replace characters outside the allowed plaintext set with spaces.
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' ', text)
    # Collapse horizontal whitespace only, keeping line breaks intact.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r' ?\n ?', '\n', text)    # trim spaces hugging newlines
    text = re.sub(r'\n{3,}', '\n\n', text)  # at most one blank line in a row
    return text.strip()
def extract_text_with_docling(pdf_path, output_dir, document_name):
    """Extract plaintext from a PDF, preferring Docling, with a PyMuPDF fallback.

    The cleaned text is written to <output_dir>/<document_name>/
    <document_name>_text_content.md.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Root directory for processed output. (Previously this
            parameter was ignored in favor of a hard-coded "processed" dir.)
        document_name: Name used for the output subdirectory and file stem.

    Returns:
        dict with 'text_content', 'text_file' and 'processed_dir' keys,
        or None when both extraction strategies fail.
    """
    print(f"Extracting text content with Docling: {pdf_path}")
    try:
        converter = DocumentConverter()
        # NOTE(review): assumes this docling version exposes ocr_options
        # directly on the converter — confirm against the installed API.
        converter.ocr_options.engine = "rapidocr"  # Use faster OCR engine
        converter.ocr_options.do_ocr = True
        converter.ocr_options.do_table_ocr = True
        result = converter.convert(pdf_path)
        # Markdown export keeps document structure; _save_text_content()
        # strips it down to plaintext before writing.
        text_content = result.document.export_to_markdown()
        saved = _save_text_content(text_content, output_dir, document_name)
        print(f"✅ Text content extracted and saved to: {saved['text_file']}")
        return saved
    except Exception as e:
        print(f"❌ Docling failed: {e}")
        print("🔄 Trying PyMuPDF fallback...")
        try:
            text_content = extract_text_with_pymupdf(pdf_path)
            if not text_content:
                print("⚠️ PyMuPDF fallback also failed")
                return None
            saved = _save_text_content(text_content, output_dir, document_name)
            print(f"✅ Text content extracted with PyMuPDF fallback: {saved['text_file']}")
            return saved
        except Exception as fallback_error:
            print(f"❌ PyMuPDF fallback also failed: {fallback_error}")
            return None


def _save_text_content(text_content, output_dir, document_name):
    """Clean *text_content* and write it to <output_dir>/<document_name>/.

    Shared by the Docling and PyMuPDF paths (previously duplicated inline).
    Returns the result dict shape promised by extract_text_with_docling.
    """
    cleaned = clean_text(text_content)
    processed_dir = Path(output_dir) / document_name
    processed_dir.mkdir(parents=True, exist_ok=True)
    text_file = processed_dir / f"{document_name}_text_content.md"
    with open(text_file, 'w', encoding='utf-8') as f:
        f.write(cleaned)
    return {
        'text_content': cleaned,
        'text_file': text_file,
        'processed_dir': processed_dir,
    }
def extract_text_with_pymupdf(pdf_path):
    """Extract per-page plaintext from *pdf_path* using PyMuPDF (fitz).

    Each page's text is cleaned with clean_text() and preceded by a
    "--- Page N ---" separator (consumed by get_slide_text_content).

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Combined text for all pages, or None if opening/reading fails.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # Build with a list + join instead of quadratic string +=.
            parts = []
            for page_num in range(len(doc)):
                page_text = clean_text(doc[page_num].get_text())
                parts.append(f"\n--- Page {page_num + 1} ---\n")
                parts.append(page_text)
                parts.append("\n")
            return "".join(parts)
        finally:
            # Always release the document handle, even if a page read raises
            # (previously the handle leaked on any per-page exception).
            doc.close()
    except Exception as e:
        print(f"PyMuPDF extraction failed: {e}")
        return None
def get_slide_text_content(text_content, slide_num):
    """Extract the text for one slide (1-based) from the full document text.

    Primary strategy: locate the "--- Page N ---" separator written by
    extract_text_with_pymupdf. Fallbacks: blank-line-separated sections,
    then a fixed window of lines.

    BUG FIX: the original compared page.strip().startswith(f" {N} ---");
    strip() removes the leading space the pattern requires, so the page
    lookup could never match and every call fell through to the heuristics.

    Args:
        text_content: Full extracted document text (may be empty/None).
        slide_num: 1-based slide/page number.

    Returns:
        Cleaned plaintext for the slide, "" for empty input, or a
        placeholder string if extraction raises.
    """
    try:
        if not text_content:
            return ""
        # Split on the page markers; each piece starts like " 3 ---\n<text>".
        pages = text_content.split('--- Page')
        target_page = None
        for page in pages:
            # lstrip() + no leading space in the pattern so "3 ---" matches.
            if page.lstrip().startswith(f"{slide_num} ---"):
                target_page = page
                break
        if target_page:
            lines = target_page.split('\n')[1:]  # drop the "N ---" header line
            slide_text = '\n'.join(lines).strip()
            return clean_text(slide_text)
        # Fallback 1: treat blank-line-separated sections as slides.
        sections = text_content.split('\n\n')
        if slide_num <= len(sections):
            return clean_text(sections[slide_num - 1] if slide_num > 0 else sections[0])
        # Fallback 2: approximate a slide as a window of lines.
        lines = text_content.split('\n')
        start_line = (slide_num - 1) * 5               # ~5 lines per slide
        end_line = min(start_line + 10, len(lines))    # up to 10 lines
        return clean_text('\n'.join(lines[start_line:end_line]))
    except Exception as e:
        print(f"⚠️ Error extracting text for slide {slide_num}: {e}")
        return f"[Text content for slide {slide_num} could not be extracted]"
# Import-time trace matching the one at the top of the module.
print('🔴 DOCLING_PROCESSOR.PY: Import complete!')