technical-screen-2025-10-22/modules/file_utils.py

112 lines
3.8 KiB
Python

#!/usr/bin/env python3
import subprocess
import sys
from pathlib import Path
def detect_file_type(file_path) -> str:
    """Classify a file by its extension.

    Only the final suffix of ``file_path`` is examined, and the comparison
    is case-insensitive (``"DECK.PPTX"`` and ``"deck.pptx"`` are treated
    the same). The file's contents are never read.

    Args:
        file_path: Path to inspect, as a string or path-like object.

    Returns:
        One of ``'pdf'``, ``'powerpoint'``, ``'word'``,
        ``'openoffice_presentation'``, ``'openoffice_document'``, or
        ``'unknown'`` when the suffix is missing or not recognised.
    """
    file_ext = Path(file_path).suffix.lower()

    file_types = {
        '.pdf': 'pdf',
        '.pptx': 'powerpoint',
        '.ppt': 'powerpoint',
        '.docx': 'word',
        '.doc': 'word',
        '.odp': 'openoffice_presentation',
        '.odt': 'openoffice_document',
    }

    return file_types.get(file_ext, 'unknown')
def convert_to_pdf(input_file, output_dir, document_name):
    """Convert a supported document to PDF.

    PowerPoint files are first converted with ``pptxtopdf`` in a child
    Python process; if that process exits non-zero, LibreOffice is used as
    a fallback. Word and OpenOffice files go straight to LibreOffice.

    Args:
        input_file: Path of the document to convert (str or path-like).
        output_dir: Directory in which the PDF is created (str or
            path-like).
        document_name: Base name for the result; the PDF is written as
            ``<document_name>_temp.pdf`` inside ``output_dir``.

    Returns:
        ``input_file`` unchanged when it is already a PDF, the path of the
        generated PDF as a string on success, or ``None`` when the type is
        unsupported, the conversion fails, or it times out.
    """
    file_type = detect_file_type(input_file)

    if file_type == 'pdf':
        print("✅ File is already PDF, no conversion needed")
        return input_file

    print(f"🔄 Converting {file_type} file to PDF...")

    try:
        # Build the target with pathlib so output_dir may be a str or a Path
        # and so the .exists() check below runs on a real Path object.
        temp_pdf = Path(output_dir) / f"{document_name}_temp.pdf"

        if file_type == 'powerpoint':
            print(" Using pptxtopdf for PowerPoint conversion...")
            # The paths travel as argv entries rather than being spliced into
            # the -c source, so quotes or backslashes in a path cannot break
            # (or inject into) the helper code. sys.executable keeps the
            # child in the same interpreter/environment as this process.
            # NOTE(review): confirm pptxtopdf.convert expects an output *file*
            # path as its second argument rather than an output directory.
            result = subprocess.run(
                [
                    sys.executable, '-c',
                    'import sys, pptxtopdf; '
                    'pptxtopdf.convert(sys.argv[1], sys.argv[2])',
                    str(input_file), str(temp_pdf),
                ],
                capture_output=True, text=True, timeout=60,
            )

            if result.returncode != 0:
                print(f"⚠️ pptxtopdf failed: {result.stderr}")
                # Fallback: try using LibreOffice
                return convert_with_libreoffice(input_file, temp_pdf, file_type)

        elif file_type in ('word', 'openoffice_document',
                           'openoffice_presentation'):
            # Word and OpenOffice formats are handled by LibreOffice only.
            return convert_with_libreoffice(input_file, temp_pdf, file_type)

        else:
            print(f"❌ Unsupported file type: {file_type}")
            return None

        # Only reached after pptxtopdf exited with status 0.
        if temp_pdf.exists():
            print(f"✅ Successfully converted to PDF: {temp_pdf}")
            return str(temp_pdf)

        print("❌ Conversion failed - PDF file not created")
        return None

    except subprocess.TimeoutExpired:
        print("❌ Conversion timed out")
        return None
    except Exception as e:
        # Best-effort API: callers get None rather than an exception.
        print(f"❌ Conversion error: {e}")
        return None
def convert_with_libreoffice(input_file, output_pdf, file_type):
    """Convert a document to PDF by running LibreOffice headlessly.

    Runs ``soffice --headless --convert-to pdf`` with the output directory
    set to the parent of ``output_pdf``. The command is expected to write
    ``<input stem>.pdf`` into that directory; that file is then moved to
    ``output_pdf``, overwriting any existing file there.

    Args:
        input_file: Path of the document to convert (str or path-like).
        output_pdf: Desired path of the resulting PDF (str or path-like).
        file_type: Label for the input type; used only in log output.

    Returns:
        ``output_pdf`` as a string on success, or ``None`` when the command
        fails, produces no PDF, times out, or cannot be started.
    """
    try:
        print(f" Using LibreOffice for {file_type} conversion...")

        # Accept both str and Path: callers may pass a plain string, and the
        # steps below need Path methods (.parent, .exists, .replace).
        output_pdf = Path(output_pdf)
        out_dir = output_pdf.parent

        # LibreOffice command
        cmd = [
            'soffice', '--headless', '--convert-to', 'pdf',
            '--outdir', str(out_dir),
            str(input_file),
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

        if result.returncode == 0:
            # LibreOffice creates PDF with same name as input
            libreoffice_pdf = out_dir / f"{Path(input_file).stem}.pdf"

            if libreoffice_pdf.exists():
                # Move to the expected name; replace() overwrites an existing
                # target on every platform, unlike rename() on Windows.
                libreoffice_pdf.replace(output_pdf)
                print(f"✅ LibreOffice conversion successful: {output_pdf}")
                return str(output_pdf)

        # Non-zero exit status, or exit status 0 without the expected file.
        print(f"⚠️ LibreOffice conversion failed: {result.stderr}")
        return None

    except subprocess.TimeoutExpired:
        print("❌ LibreOffice conversion timed out")
        return None
    except Exception as e:
        # Includes soffice not being installed (FileNotFoundError).
        print(f"❌ LibreOffice conversion error: {e}")
        return None