212 lines
8.3 KiB
Python
212 lines
8.3 KiB
Python
#!/usr/bin/env python3
"""Pitch deck analyzer entry point.

Extracts slides from a PDF, runs AI-agent analysis on them, builds a
markdown report with a table of contents, saves it under processed/,
and uploads it. Heavily instrumented with flushed prints so progress
is visible even under block-buffered stdout.
"""

# Printed before any imports so a hang during import is distinguishable
# from a hang before the interpreter even reached this file.
print("🚀 APP.PY STARTING - IMMEDIATE FEEDBACK", flush=True)

import sys
import os
import re
import time
from pathlib import Path

print("📦 BASIC IMPORTS COMPLETE", flush=True)
|
|
|
|
def generate_toc(markdown_content):
    """Generate a Table of Contents from markdown headers.

    Scans *markdown_content* for headers of level 2 and deeper
    (``##``, ``###``, ...) and builds a nested bullet list of anchor
    links. Repeated header titles get numeric anchor suffixes
    (``-1``, ``-2``, ...) matching GitHub's slug de-duplication, so
    each TOC entry links to its own section rather than all pointing
    at the first occurrence.

    Args:
        markdown_content: The full markdown document as one string.

    Returns:
        The TOC as a markdown string (heading, bullet list, and a
        trailing horizontal rule).
    """
    print(" 📋 Generating Table of Contents...", flush=True)

    # Compile once; matched per line in the loop below.
    header_pattern = re.compile(r'^(#{2,})\s+(.+)$')

    toc_lines = ["## Table of Contents", ""]
    seen_anchors = {}  # slug -> number of earlier occurrences
    header_count = 0

    for line in markdown_content.split('\n'):
        header_match = header_pattern.match(line)
        if not header_match:
            continue

        header_count += 1
        level = len(header_match.group(1)) - 2  # ## -> 0, ### -> 1, etc.
        title = header_match.group(2)

        # Create anchor link the way GitHub slugifies headings:
        # lowercase, strip punctuation, whitespace runs -> hyphens.
        anchor = re.sub(r'[^a-zA-Z0-9\s-]', '', title.lower())
        anchor = re.sub(r'\s+', '-', anchor.strip())

        # De-duplicate repeated anchors (GitHub appends -1, -2, ...).
        if anchor in seen_anchors:
            seen_anchors[anchor] += 1
            anchor = f"{anchor}-{seen_anchors[anchor]}"
        else:
            seen_anchors[anchor] = 0

        # Two spaces per level: CommonMark needs >= 2 spaces for a
        # bullet to nest under its parent item (one space does not).
        indent = "  " * level
        toc_lines.append(f"{indent}- [{title}](#{anchor})")

    toc_lines.extend(["", "---", ""])

    print(f" ✅ Generated TOC with {header_count} headers", flush=True)
    return '\n'.join(toc_lines)
|
|
|
|
def _import_pipeline_modules():
    """Import the pipeline callables directly from the modules/ directory.

    Imports the submodules directly (not through the package) to avoid
    __init__.py import-time issues, reporting progress for each one.

    Returns:
        Tuple ``(get_openrouter_client, extract_slides_from_pdf,
        analyze_slides_batch, send_to_api_and_get_haste_link)``.
    """
    sys.path.append('modules')

    print(" 🔄 Importing client module...", flush=True)
    from client import get_openrouter_client
    print(" ✅ client module imported successfully", flush=True)

    print(" 🔄 Importing pdf_processor module...", flush=True)
    from pdf_processor import extract_slides_from_pdf
    print(" ✅ pdf_processor module imported successfully", flush=True)

    print(" 🔄 Importing analysis module...", flush=True)
    from analysis import analyze_slides_batch
    print(" ✅ analysis module imported successfully", flush=True)

    print(" 🔄 Importing markdown_utils module...", flush=True)
    from markdown_utils import send_to_api_and_get_haste_link
    print(" ✅ markdown_utils module imported successfully", flush=True)

    return (get_openrouter_client, extract_slides_from_pdf,
            analyze_slides_batch, send_to_api_and_get_haste_link)


def _build_report_markdown(pdf_path, slides, analysis_results):
    """Build the report body: title, metadata, and per-slide sections.

    Args:
        pdf_path: Path to the source PDF (used for title/metadata only).
        slides: Sequence of extracted slide records (length drives the
            per-slide loop; the records themselves are not read here).
        analysis_results: Mapping of 1-based slide number to a dict of
            per-agent entries, each expected to carry 'agent' and
            'analysis' keys (as produced by analyze_slides_batch —
            malformed entries are skipped).

    Returns:
        The report body as one markdown string (no TOC yet).
    """
    print(" 🔄 Building markdown content...", flush=True)

    # Collect fragments and join once instead of repeated string +=.
    parts = [f"# Pitch Deck Analysis: {Path(pdf_path).stem}\n\n"]

    # Add analysis metadata
    parts.append("This analysis was generated using multiple AI agents, each specialized in different aspects of slide evaluation.\n\n")
    parts.append(f"**Source File:** `{Path(pdf_path).name}` (PDF)\n")
    parts.append(f"**Analysis Generated:** {len(slides)} slides processed (limited for testing)\n")
    parts.append("**Processing Method:** Individual processing with specialized AI agents\n")
    parts.append("**Text Extraction:** Docling-powered text transcription\n\n")

    print(f" 📊 Building markdown for {len(slides)} slides...", flush=True)

    for slide_num in range(1, len(slides) + 1):
        print(f" 🔄 Processing slide {slide_num}/{len(slides)}...", flush=True)

        analysis = analysis_results.get(slide_num, {})

        parts.append(f"# Slide {slide_num}\n\n")
        parts.append("\n\n")  # spacer kept for byte-compatibility with prior reports

        parts.append("## Agentic Analysis\n\n")
        if analysis:
            # Format each agent's analysis; skip malformed entries.
            agent_count = 0
            for agent_data in analysis.values():
                if isinstance(agent_data, dict) and 'agent' in agent_data and 'analysis' in agent_data:
                    agent_count += 1
                    parts.append(f"### {agent_data['agent']}\n\n")
                    parts.append(f"{agent_data['analysis']}\n\n")

            print(f" ✅ Added {agent_count} agent analyses for slide {slide_num}", flush=True)
        else:
            parts.append("No analysis available\n\n")
            print(f" ⚠️ No analysis available for slide {slide_num}", flush=True)

        parts.append("---\n\n")

    print(" ✅ Markdown content built successfully", flush=True)
    return ''.join(parts)


def _insert_toc(markdown_content):
    """Return *markdown_content* with a generated TOC after the title.

    Assumes the content starts with the title line followed by a blank
    line, as produced by _build_report_markdown.
    """
    print(" 🔄 Generating Table of Contents...", flush=True)
    toc = generate_toc(markdown_content)

    # Insert TOC after the main title
    print(" 🔄 Inserting TOC into document...", flush=True)
    lines = markdown_content.split('\n')
    # lines[0] is the title, lines[1] the blank line after it; the TOC
    # slots in between them and the rest of the document.
    final_content = [lines[0], "", toc]
    final_content.extend(lines[2:])
    return '\n'.join(final_content)


def _save_report(final_markdown, pdf_path):
    """Write the report to processed/<stem>_analysis.md; return the path."""
    output_file = f"processed/{Path(pdf_path).stem}_analysis.md"
    print(f" 🔄 Saving to: {output_file}", flush=True)

    os.makedirs("processed", exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_markdown)
    print(f" ✅ Report saved successfully ({len(final_markdown)} characters)", flush=True)
    return output_file


def main(*, max_slides=3):
    """Run the pitch deck analysis pipeline end to end, with debugging.

    Reads the PDF path from ``sys.argv[1]``, extracts slides, analyzes
    them with the AI agent pipeline, builds a markdown report with a
    table of contents, saves it under processed/, and uploads it.

    Args:
        max_slides: Keyword-only cap on the number of slides processed;
            defaults to 3 while the pipeline is under test.
    """
    print("🚀 PITCH DECK ANALYZER MAIN FUNCTION STARTING", flush=True)
    print("=" * 50, flush=True)

    # Validate CLI arguments and the input file before any heavy work.
    if len(sys.argv) < 2:
        print("❌ Usage: python app.py <pdf_file>", flush=True)
        return

    pdf_path = sys.argv[1]
    if not os.path.exists(pdf_path):
        print(f"❌ Error: File '{pdf_path}' not found", flush=True)
        return

    print(f"📁 Processing file: {pdf_path}", flush=True)
    print(f"📁 File exists: {os.path.exists(pdf_path)}", flush=True)
    print(f"📁 File size: {os.path.getsize(pdf_path)} bytes", flush=True)

    # Import what we need directly (avoid __init__.py issues)
    print("\n📦 IMPORTING MODULES", flush=True)
    print("-" * 30, flush=True)
    (get_openrouter_client, extract_slides_from_pdf,
     analyze_slides_batch, send_to_api_and_get_haste_link) = _import_pipeline_modules()
    print("✅ ALL MODULES IMPORTED SUCCESSFULLY", flush=True)

    # Extract slides
    print("\n📄 EXTRACTING SLIDES", flush=True)
    print("-" * 30, flush=True)
    print(" 🔄 Calling extract_slides_from_pdf...", flush=True)
    start_time = time.time()
    slides = extract_slides_from_pdf(pdf_path, "processed", Path(pdf_path).stem)
    extraction_time = time.time() - start_time
    print(f" ✅ extract_slides_from_pdf completed in {extraction_time:.2f}s", flush=True)
    print(f" 📊 Extracted {len(slides)} slides", flush=True)

    # Cap the slide count while the pipeline is exercised end to end.
    print(f" 🔄 Limiting to first {max_slides} slides for testing...", flush=True)
    slides = slides[:max_slides]
    print(f" 📊 Processing {len(slides)} slides", flush=True)

    # Analyze slides
    print("\n🧠 ANALYZING SLIDES", flush=True)
    print("-" * 30, flush=True)
    print(" 🔄 Initializing API client...", flush=True)
    client = get_openrouter_client()
    print(" ✅ API client initialized successfully", flush=True)

    print(" 🔄 Calling analyze_slides_batch...", flush=True)
    analysis_start_time = time.time()
    analysis_results = analyze_slides_batch(client, slides)
    analysis_time = time.time() - analysis_start_time
    print(f" ✅ analyze_slides_batch completed in {analysis_time:.2f}s", flush=True)
    print(f" 📊 Analysis results: {len(analysis_results)} slides analyzed", flush=True)

    # Create report
    print("\n📝 CREATING REPORT", flush=True)
    print("-" * 30, flush=True)
    markdown_content = _build_report_markdown(pdf_path, slides, analysis_results)

    final_markdown = _insert_toc(markdown_content)
    print(f" ✅ Final markdown created: {len(final_markdown)} characters", flush=True)

    # Save report
    print("\n💾 SAVING REPORT", flush=True)
    print("-" * 30, flush=True)
    _save_report(final_markdown, pdf_path)

    # Always upload the report
    print("\n🌐 UPLOADING REPORT", flush=True)
    print("-" * 30, flush=True)
    print(" 🔄 Calling send_to_api_and_get_haste_link...", flush=True)
    haste_url = send_to_api_and_get_haste_link(final_markdown, Path(pdf_path).stem)
    if haste_url:
        print(f" ✅ Report uploaded successfully: {haste_url}", flush=True)
    else:
        print(" ❌ Upload failed - no URL returned", flush=True)

    print("\n🎉 PROCESSING COMPLETE!", flush=True)
    print("=" * 50, flush=True)
|
|
|
|
# Script entry point: confirm the guard was reached, then run the pipeline.
if __name__ == "__main__":
    print("🎯 __main__ BLOCK ENTERED", flush=True)
    main()
|