153 lines
5.5 KiB
Python
153 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import sys
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
|
|
def generate_toc(markdown_content):
    """Generate a GitHub-style Table of Contents from markdown headers.

    Scans *markdown_content* for headers at level 2 and deeper (``##``,
    ``###``, ...; the single-``#`` document title is deliberately excluded)
    and builds a nested bullet list of anchor links. Repeated header titles
    are disambiguated with ``-1``, ``-2``, ... suffixes, matching GitHub's
    slug behaviour, so each TOC entry links to its own header instead of
    every duplicate pointing at the first occurrence.

    Args:
        markdown_content: The full markdown document as a single string.

    Returns:
        The TOC as a markdown string, terminated by a ``---`` separator.
    """
    print(" 📋 Generating Table of Contents...")
    toc_lines = ["## Table of Contents", ""]

    header_count = 0
    anchor_counts = {}  # base slug -> number of times seen so far

    for line in markdown_content.split('\n'):
        # Match headers (##, ###, etc.)
        header_match = re.match(r'^(#{2,})\s+(.+)$', line)
        if not header_match:
            continue

        header_count += 1
        level = len(header_match.group(1)) - 2  # Convert ## to 0, ### to 1, etc.
        title = header_match.group(2)

        # Create anchor link (GitHub slug: lowercase, drop punctuation,
        # collapse whitespace to dashes).
        anchor = re.sub(r'[^a-zA-Z0-9\s-]', '', title.lower())
        anchor = re.sub(r'\s+', '-', anchor.strip())

        # Disambiguate duplicate titles the way GitHub does: first
        # occurrence keeps the plain slug, later ones get -1, -2, ...
        seen = anchor_counts.get(anchor, 0)
        anchor_counts[anchor] = seen + 1
        if seen:
            anchor = f"{anchor}-{seen}"

        # Indent two spaces per nesting level below ##.
        indent = "  " * level
        toc_lines.append(f"{indent}- [{title}](#{anchor})")

    toc_lines.extend(["", "---", ""])

    print(f" ✅ Generated TOC with {header_count} headers")
    return '\n'.join(toc_lines)
|
|
|
|
def main():
    """Simple pitch deck analyzer.

    Pipeline: read a PDF path from ``sys.argv[1]``, extract slides, run the
    AI agent analysis, assemble a markdown report with a table of contents,
    save it under ``processed/<stem>_analysis.md``, and upload it.

    Usage: ``python app.py <pdf_file>``. Prints an error and returns early
    when the argument is missing or the file does not exist.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py <pdf_file>")
        return

    pdf_path = sys.argv[1]
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found")
        return

    print(f"🚀 Processing: {pdf_path}")

    # Import what we need directly (avoid __init__.py issues). These are
    # project-local modules resolved via the 'modules' directory.
    print("📦 Importing modules...")
    sys.path.append('modules')
    from client import get_openrouter_client
    from pdf_processor import extract_slides_from_pdf
    from analysis import analyze_slides_batch
    from markdown_utils import send_to_api_and_get_haste_link
    print("✅ Modules imported successfully")

    # Hoist the stem — it is needed for the title, the output path, and the
    # upload name.
    stem = Path(pdf_path).stem

    # Extract slides
    print("📄 Extracting slides...")
    slides = extract_slides_from_pdf(pdf_path, "processed", stem)
    print(f"✅ Extracted {len(slides)} slides")

    # Analyze slides
    print("🧠 Analyzing slides...")
    client = get_openrouter_client()
    print("🔗 API client initialized")

    analysis_results = analyze_slides_batch(client, slides)
    print("✅ Analysis complete")

    # Create report. Accumulate fragments in a list and join once at the
    # end — repeated `str +=` in a loop is quadratic in the worst case.
    print("📝 Creating report...")
    parts = [f"# Pitch Deck Analysis: {stem}\n\n"]

    # Add analysis metadata
    parts.append("This analysis was generated using multiple AI agents, each specialized in different aspects of slide evaluation.\n\n")
    parts.append(f"**Source File:** `{Path(pdf_path).name}` (PDF)\n")
    parts.append(f"**Analysis Generated:** {len(slides)} slides processed\n")
    parts.append("**Processing Method:** Individual processing with specialized AI agents\n")
    parts.append("**Text Extraction:** Docling-powered text transcription\n\n")

    print(f"📊 Building markdown for {len(slides)} slides...")
    for i, slide_data in enumerate(slides):
        slide_num = i + 1
        # analysis_results is keyed by 1-based slide number; missing slides
        # fall back to an empty dict and render the "no analysis" branch.
        analysis = analysis_results.get(slide_num, {})

        print(f" 📄 Processing slide {slide_num}...")

        parts.append(f"# Slide {slide_num}\n\n")
        # NOTE(review): the original appended a placeholder-less f-string
        # here — looks like an image/content embed was lost; preserving the
        # original output (a blank paragraph). TODO confirm intent.
        parts.append("\n\n")

        if analysis:
            parts.append("## Agentic Analysis\n\n")

            # Format each agent's analysis; skip entries that do not carry
            # both an 'agent' name and an 'analysis' body.
            agent_count = 0
            for agent_key, agent_data in analysis.items():
                if isinstance(agent_data, dict) and 'agent' in agent_data and 'analysis' in agent_data:
                    agent_count += 1
                    parts.append(f"### {agent_data['agent']}\n\n")
                    parts.append(f"{agent_data['analysis']}\n\n")

            print(f" ✅ Added {agent_count} agent analyses")
        else:
            parts.append("## Agentic Analysis\n\n")
            parts.append("No analysis available\n\n")
            print(f" ⚠️ No analysis available for slide {slide_num}")

        parts.append("---\n\n")

    markdown_content = "".join(parts)

    # Generate Table of Contents
    print("📋 Generating Table of Contents...")
    toc = generate_toc(markdown_content)

    # Insert TOC after the main title (line 0 is the "# ..." title, line 1
    # is the blank line that follows it — replaced by the inserted block).
    print("🔗 Inserting TOC into document...")
    doc_lines = markdown_content.split('\n')
    final_markdown = '\n'.join([doc_lines[0], "", toc] + doc_lines[2:])

    # Save report
    output_file = f"processed/{stem}_analysis.md"
    print(f"💾 Saving report to: {output_file}")
    os.makedirs("processed", exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_markdown)

    print(f"✅ Report saved successfully ({len(final_markdown)} characters)")

    # Always upload the report
    print("🌐 Uploading report...")
    haste_url = send_to_api_and_get_haste_link(final_markdown, stem)
    if haste_url:
        print(f"✅ Report uploaded to: {haste_url}")
    else:
        print("❌ Upload failed")
|
|
# Script entry point: run the analyzer only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|