#!/usr/bin/env python3
"""Extract plaintext from a PDF with Docling, falling back to PyMuPDF.

Cleaned text is written to processed/<document_name>/<document_name>_text_content.md
and per-slide text can be recovered from it with get_slide_text_content().
"""
print('🔴 DOCLING_PROCESSOR.PY: Starting import...')

import re
from pathlib import Path

from docling.document_converter import DocumentConverter
import fitz  # PyMuPDF as fallback


def clean_text(text):
    """Clean text to ensure it's plaintext with no special characters or LaTeX.

    Args:
        text: Raw text (possibly None or empty) to normalize.

    Returns:
        Plaintext with LaTeX markup stripped, non-whitelisted characters
        replaced by spaces, horizontal whitespace collapsed, and runs of
        blank lines squeezed to a single blank line.
    """
    if not text:
        return ""
    # Remove LaTeX commands and math expressions
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # Remove \command{content}
    text = re.sub(r'\$[^$]*\$', '', text)             # Remove $math$ expressions
    text = re.sub(r'\\[a-zA-Z]+', '', text)           # Remove remaining \commands
    # Replace anything outside the plaintext whitelist with a space
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' ', text)
    # BUG FIX: collapse only *horizontal* whitespace. The original r'\s+' -> ' '
    # destroyed every newline, which (a) made the blank-line normalization
    # below a dead no-op and (b) flattened the document so the
    # "--- Page N ---" separators that get_slide_text_content splits on could
    # never sit on their own lines.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'[ \t]*\n[ \t]*', '\n', text)  # trim spaces hugging newlines
    text = re.sub(r'\n\s*\n', '\n\n', text)       # squeeze blank-line runs
    return text.strip()


def _save_extracted_text(text_content, document_name):
    """Persist cleaned text under processed/<document_name>/ and describe it.

    Shared by the Docling and PyMuPDF paths (the original duplicated this
    logic verbatim in both branches).

    Returns:
        Dict with 'text_content' (str), 'text_file' (Path) and
        'processed_dir' (Path).
    """
    # NOTE(review): the output location is hard-coded to "processed/"; the
    # caller-supplied output_dir argument of extract_text_with_docling is
    # ignored (pre-existing behavior, preserved for compatibility) — confirm
    # whether output_dir should be honored instead.
    processed_dir = Path("processed") / document_name
    processed_dir.mkdir(parents=True, exist_ok=True)
    text_file = processed_dir / f"{document_name}_text_content.md"
    text_file.write_text(text_content, encoding='utf-8')
    return {
        'text_content': text_content,
        'text_file': text_file,
        'processed_dir': processed_dir,
    }


def extract_text_with_docling(pdf_path, output_dir, document_name):
    """Extract text content from PDF using Docling with PyMuPDF fallback.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Currently unused — output always lands in "processed/"
            (see NOTE in _save_extracted_text).
        document_name: Basename used for the output directory and file.

    Returns:
        Dict with 'text_content', 'text_file' and 'processed_dir' on success,
        or None when both Docling and the PyMuPDF fallback fail.
    """
    print(f"Extracting text content with Docling: {pdf_path}")
    try:
        converter = DocumentConverter()
        # Configure OCR for better text extraction.
        # NOTE(review): recent Docling releases configure OCR through
        # PdfPipelineOptions passed to DocumentConverter, not attributes on
        # the converter; if these attributes are absent this raises and we
        # fall through to the PyMuPDF path below — confirm intended API.
        converter.ocr_options.engine = "rapidocr"  # Use faster OCR engine
        converter.ocr_options.do_ocr = True
        converter.ocr_options.do_table_ocr = True

        result = converter.convert(pdf_path)
        # Export to markdown, then force it down to plaintext.
        text_content = clean_text(result.document.export_to_markdown())
        saved = _save_extracted_text(text_content, document_name)
        print(f"✅ Text content extracted and saved to: {saved['text_file']}")
        return saved
    except Exception as e:
        print(f"❌ Docling failed: {e}")
        print("🔄 Trying PyMuPDF fallback...")
        try:
            text_content = extract_text_with_pymupdf(pdf_path)
            if not text_content:
                print("⚠️ PyMuPDF fallback also failed")
                return None
            text_content = clean_text(text_content)
            saved = _save_extracted_text(text_content, document_name)
            print(f"✅ Text content extracted with PyMuPDF fallback: {saved['text_file']}")
            return saved
        except Exception as fallback_error:
            print(f"❌ PyMuPDF fallback also failed: {fallback_error}")
            return None


def extract_text_with_pymupdf(pdf_path):
    """Extract text using PyMuPDF as fallback with clean formatting.

    Returns:
        Concatenated per-page text, each page prefixed with a
        "--- Page N ---" separator line, or None when extraction fails.
    """
    try:
        parts = []
        # Context manager guarantees the document handle is closed even when
        # a page raises mid-loop (the original leaked it on error).
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                page_text = clean_text(page.get_text())
                parts.append(f"\n--- Page {page_num} ---\n")
                parts.append(page_text)
                parts.append("\n")
        return "".join(parts)
    except Exception as e:
        print(f"PyMuPDF extraction failed: {e}")
        return None


def get_slide_text_content(text_content, slide_num):
    """Extract text content for a specific slide from the full document text.

    Tries the "--- Page N ---" separators first, then paragraph sections,
    then an approximate line window as a last resort.

    Args:
        text_content: Full extracted document text (may be empty/None).
        slide_num: 1-based slide/page number.

    Returns:
        Cleaned text for the slide, or a placeholder string on error.
    """
    try:
        if not text_content:
            return ""
        # Split by page separators
        pages = text_content.split('--- Page')
        # BUG FIX: the original tested
        # page.strip().startswith(f" {slide_num} ---"), but strip() removes
        # the leading space the pattern requires, so the match could never
        # succeed and every call fell through to the heuristics below.
        target_page = None
        for page in pages:
            if page.strip().startswith(f"{slide_num} ---"):
                target_page = page
                break
        if target_page:
            lines = target_page.split('\n')[1:]  # drop the page-header line
            return clean_text('\n'.join(lines).strip())
        # Fallback 1: paragraph-separated sections.
        sections = text_content.split('\n\n')
        if slide_num <= len(sections):
            return clean_text(sections[slide_num - 1] if slide_num > 0 else sections[0])
        # Fallback 2: approximate window of ~5 lines per slide, up to 10 lines.
        lines = text_content.split('\n')
        start_line = (slide_num - 1) * 5
        end_line = min(start_line + 10, len(lines))
        return clean_text('\n'.join(lines[start_line:end_line]))
    except Exception as e:
        print(f"⚠️ Error extracting text for slide {slide_num}: {e}")
        return f"[Text content for slide {slide_num} could not be extracted]"


print('🔴 DOCLING_PROCESSOR.PY: Import complete!')