63 lines
2.1 KiB
Python
63 lines
2.1 KiB
Python
#!/usr/bin/env python3
# The interpreter line above must stay first: a shebang is only honoured as
# the very first line of a script, anywhere else it is just a comment.

# Import-time trace marker, printed as soon as the module starts loading.
print('🟢 PDF_PROCESSOR.PY: Starting import...')
|
|
|
|
import base64
|
|
import fitz # PyMuPDF for PDF processing
|
|
from pathlib import Path
|
|
|
|
|
|
def extract_slides_from_pdf(pdf_path, output_dir, document_name):
    """Render every page of a PDF to a PNG slide image.

    Images are written to ``processed/<document_name>/slides/`` relative to
    the current working directory, and each one is also returned
    base64-encoded.

    Args:
        pdf_path: Location of the PDF, passed straight to ``fitz.open``.
        output_dir: Currently unused; kept so existing callers keep working.
            Output always goes under ``processed/<document_name>``.
        document_name: Name of the per-document output directory and prefix
            of every slide filename.

    Returns:
        A list with one dict per page, in page order, holding ``page_num``
        (1-based), ``filename``, ``path`` (``Path`` of the saved PNG),
        ``base64`` (PNG bytes as a base64 string), ``document_name`` and
        ``processed_dir``. An empty list is returned if anything fails while
        opening or rendering the PDF; slides already written to disk are
        left in place.
    """
    print(f"Extracting slides from PDF: {pdf_path}")

    # Create processed directory structure: ./processed/DocumentName/
    processed_dir = Path("processed") / document_name
    processed_dir.mkdir(parents=True, exist_ok=True)

    # Create slides directory within processed directory
    slides_dir = processed_dir / "slides"
    slides_dir.mkdir(exist_ok=True)

    slides = []

    try:
        # 2x zoom for better quality; identical for every page, so build once.
        zoom = fitz.Matrix(2.0, 2.0)

        # The context manager closes the document on every exit path,
        # including when a page fails to render or save.
        with fitz.open(pdf_path) as pdf_document:
            for page_num, page in enumerate(pdf_document, start=1):
                # Convert page to image (high resolution)
                pix = page.get_pixmap(matrix=zoom)

                # Save as PNG with document name prefix
                slide_filename = f"{document_name}_slide_{page_num:03d}.png"
                slide_path = slides_dir / slide_filename
                pix.save(str(slide_path))

                # Convert to base64 for API
                img_data = pix.tobytes("png")
                img_base64 = base64.b64encode(img_data).decode('utf-8')

                slides.append({
                    'page_num': page_num,
                    'filename': slide_filename,
                    'path': slide_path,
                    'base64': img_base64,
                    'document_name': document_name,
                    'processed_dir': processed_dir,
                })

                print(f"  Extracted slide {page_num}")

        print(f"✅ Extracted {len(slides)} slides")
        return slides

    except Exception as e:
        # Deliberate best-effort boundary: callers receive an empty list
        # instead of an exception when extraction fails.
        print(f"❌ Error extracting slides: {e}")
        return []
|
|
# Import-time trace marker, printed once the module-level code above has run.
print('🟢 PDF_PROCESSOR.PY: Import complete!')
|