technical-screen-2025-10-22/modules/pdf_processor.py

63 lines
2.1 KiB
Python

#!/usr/bin/env python3
# Import-time trace: printed as soon as this module starts loading, before its
# dependencies (base64, fitz, pathlib) are imported.
print('🟢 PDF_PROCESSOR.PY: Starting import...')
import base64
import fitz # PyMuPDF for PDF processing
from pathlib import Path
def extract_slides_from_pdf(pdf_path, output_dir, document_name):
    """Render every page of a PDF to a PNG file and return per-slide metadata.

    Images are written to ``./processed/<document_name>/slides/`` (relative to
    the current working directory); the directories are created if missing.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        output_dir: Currently unused; kept so existing callers keep working.
            Output always goes under ``./processed/<document_name>/``.
        document_name: Used as the output sub-directory name and as the
            prefix of each generated PNG file name.

    Returns:
        A list with one dict per page, in page order, with the keys
        ``page_num`` (1-based), ``filename``, ``path`` (``Path`` of the PNG),
        ``base64`` (PNG bytes as a base64 ``str``), ``document_name`` and
        ``processed_dir``. An empty list is returned if anything fails while
        opening or rendering the PDF; the error is printed, not raised, and
        slides rendered before the failure are discarded from the result.
    """
    print(f"Extracting slides from PDF: {pdf_path}")

    # Create processed directory structure: ./processed/DocumentName/slides/
    processed_dir = Path("processed") / document_name
    processed_dir.mkdir(parents=True, exist_ok=True)
    slides_dir = processed_dir / "slides"
    slides_dir.mkdir(exist_ok=True)

    slides = []
    try:
        # 2x zoom for better quality; identical for every page, so build once.
        zoom = fitz.Matrix(2.0, 2.0)

        # The context manager closes the document even when a page fails to
        # render or save, so the file handle is never leaked.
        with fitz.open(pdf_path) as pdf_document:
            for page_num, page in enumerate(pdf_document, start=1):
                pix = page.get_pixmap(matrix=zoom)

                # Save as PNG with document name prefix
                slide_filename = f"{document_name}_slide_{page_num:03d}.png"
                slide_path = slides_dir / slide_filename
                pix.save(str(slide_path))

                # Convert to base64 for API
                img_data = pix.tobytes("png")
                img_base64 = base64.b64encode(img_data).decode('utf-8')

                slides.append({
                    'page_num': page_num,
                    'filename': slide_filename,
                    'path': slide_path,
                    'base64': img_base64,
                    'document_name': document_name,
                    'processed_dir': processed_dir,
                })
                print(f" Extracted slide {page_num}")

        print(f"✅ Extracted {len(slides)} slides")
        return slides
    except Exception as e:
        # Deliberate best-effort: callers receive an empty list on failure.
        print(f"❌ Error extracting slides: {e}")
        return []
# Import-time trace: printed once the module body has finished executing.
print('🟢 PDF_PROCESSOR.PY: Import complete!')