#!/usr/bin/env python3
"""Extract plaintext from PDF documents using Docling, with a PyMuPDF fallback."""

import re
from pathlib import Path

import fitz  # PyMuPDF, used as the fallback extractor
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption


def clean_text(text):
    """Clean text so it is plaintext with no special characters or LaTeX."""
    if not text:
        return ""

    # Remove LaTeX commands and math expressions
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # \command{content}
    text = re.sub(r'\$[^$]*\$', '', text)  # $math$ expressions
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # remaining \commands

    # Remove special characters, keeping common punctuation
    text = re.sub(r'[^\w\s.,!?;:\-()\[\]"\'/&%@#$+=<>]', ' ', text)

    # Collapse runs of spaces and tabs, but preserve newlines so that the
    # page separators and paragraph breaks relied on below survive cleaning
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n\n', text)

    return text.strip()
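

# Quick illustration of clean_text's behavior (a hedged sketch; the exact
# output depends on the regexes above):
#
#   clean_text(r"Euler's identity \emph{says} $e^{i\pi} + 1 = 0$.")
#   # -> "Euler's identity ."  (LaTeX commands and $...$ math are stripped)
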
def extract_text_with_docling(pdf_path, output_dir, document_name):
    """Extract text content from a PDF using Docling, with a PyMuPDF fallback."""
    print(f"Extracting text content with Docling: {pdf_path}")

    text_content = None
    try:
        # OCR settings live on the PDF pipeline options, not on the converter
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        # RapidOCR is a faster engine; this assumes the optional rapidocr
        # dependency is installed (EasyOcrOptions is the stock alternative)
        pipeline_options.ocr_options = RapidOcrOptions()

        converter = DocumentConverter(
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
        )

        # Convert the PDF and export the document body as Markdown
        result = converter.convert(pdf_path)
        text_content = result.document.export_to_markdown()
    except Exception as e:
        print(f"❌ Docling failed: {e}")
        print("🔄 Trying PyMuPDF fallback...")
        text_content = extract_text_with_pymupdf(pdf_path)

    if not text_content:
        print("⚠️ PyMuPDF fallback also failed")
        return None

    # Clean the text to ensure it's plaintext
    text_content = clean_text(text_content)

    # Create the output directory structure if it doesn't exist
    processed_dir = Path(output_dir) / document_name
    processed_dir.mkdir(parents=True, exist_ok=True)

    # Save the text content to a file
    text_file = processed_dir / f"{document_name}_text_content.md"
    text_file.write_text(text_content, encoding='utf-8')

    print(f"✅ Text content extracted and saved to: {text_file}")

    return {
        'text_content': text_content,
        'text_file': text_file,
        'processed_dir': processed_dir,
    }
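

# Typical call (a hedged sketch; the path and document name are hypothetical):
#
#   info = extract_text_with_docling("slides/deck.pdf", "processed", "deck")
#   if info:
#       print(info['text_file'])  # processed/deck/deck_text_content.md
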
def extract_text_with_pymupdf(pdf_path):
    """Extract text using PyMuPDF as a fallback, with clean formatting."""
    try:
        doc = fitz.open(pdf_path)
        text_content = ""

        for page_num, page in enumerate(doc, start=1):
            # Extract and clean the raw page text
            page_text = clean_text(page.get_text())

            # Add a page separator; get_slide_text_content() splits on this
            # exact "--- Page N ---" format
            text_content += f"\n--- Page {page_num} ---\n"
            text_content += page_text
            text_content += "\n"

        doc.close()
        return text_content

    except Exception as e:
        print(f"PyMuPDF extraction failed: {e}")
        return None
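

# Standalone use of the fallback extractor (hedged; the path is hypothetical):
#
#   raw = extract_text_with_pymupdf("slides/deck.pdf")
#   if raw:
#       print(raw[:200])  # starts with the "--- Page 1 ---" separator
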
def get_slide_text_content(text_content, slide_num):
    """Extract the text content for a specific slide from the full document text."""
    try:
        if not text_content:
            return ""

        # Split on the page separators written by extract_text_with_pymupdf
        pages = text_content.split('--- Page')

        # Find the page for this slide; after strip() the leading space left
        # by the split is gone, so match "N ---" rather than " N ---"
        target_page = None
        for page in pages:
            if page.strip().startswith(f"{slide_num} ---"):
                target_page = page
                break

        if target_page:
            # Drop the page-header line and clean up the rest
            lines = target_page.split('\n')[1:]
            slide_text = '\n'.join(lines).strip()
            return clean_text(slide_text)

        # Fallback: treat blank-line-separated sections as slides
        sections = text_content.split('\n\n')
        if slide_num <= len(sections):
            return clean_text(sections[slide_num - 1] if slide_num > 0 else sections[0])

        # Last resort: return a window of the text, assuming roughly five
        # lines per slide
        lines = text_content.split('\n')
        start_line = (slide_num - 1) * 5
        end_line = min(start_line + 10, len(lines))  # up to 10 lines
        return clean_text('\n'.join(lines[start_line:end_line]))

    except Exception as e:
        print(f"⚠️ Error extracting text for slide {slide_num}: {e}")
        return f"[Text content for slide {slide_num} could not be extracted]"
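

# A minimal end-to-end sketch of how these helpers compose (a hedged example;
# the PDF path and document name below are hypothetical placeholders):
if __name__ == "__main__":
    info = extract_text_with_docling("slides/deck.pdf", "processed", "deck")
    if info:
        # Docling output has no "--- Page N ---" separators, so
        # get_slide_text_content falls back to blank-line sections here
        first_slide = get_slide_text_content(info['text_content'], 1)
        print(first_slide[:300])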