Initial commit: Technical screen project with document analysis capabilities
|
|
@ -0,0 +1,5 @@
|
||||||
|
---
|
||||||
|
alwaysApply: true
|
||||||
|
---
|
||||||
|
# Code Cleanup Guidelines
|
||||||
|
Remove unused code, imports, and dead functions to keep the codebase clean and maintainable. Regular cleanup prevents technical debt and improves code readability.
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
---
|
||||||
|
alwaysApply: true
|
||||||
|
---
|
||||||
|
# Code Length Guidelines
|
||||||
|
Keep all code files under 300 lines for better maintainability and readability. If a file exceeds this limit, consider breaking it into smaller, focused modules.
|
||||||
|
|
@ -0,0 +1,55 @@
|
||||||
|
# Environment variables
|
||||||
|
.env
|
||||||
|
|
||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
venv/
|
||||||
|
env/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# OS
|
||||||
|
.DS_Store
|
||||||
|
.DS_Store?
|
||||||
|
._*
|
||||||
|
.Spotlight-V100
|
||||||
|
.Trashes
|
||||||
|
ehthumbs.db
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# Temporary files
|
||||||
|
*.tmp
|
||||||
|
*.temp
|
||||||
|
|
@ -0,0 +1,152 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def generate_toc(markdown_content):
    """Build a markdown Table of Contents from ``##``-and-deeper headers.

    Scans *markdown_content* line by line for headers of level 2 or more,
    turns each into a GitHub-style anchor link, and indents entries one
    space per level below ``##``.  Top-level ``#`` headers are ignored.

    Args:
        markdown_content: full markdown document as a single string.

    Returns:
        The TOC as a markdown string ending with a ``---`` separator.
    """
    print(" 📋 Generating Table of Contents...")

    header_re = re.compile(r'^(#{2,})\s+(.+)$')
    entries = ["## Table of Contents", ""]

    # Collect every header match in document order.
    matches = [m for m in map(header_re.match, markdown_content.split('\n')) if m]

    for m in matches:
        depth = len(m.group(1)) - 2  # ## -> 0, ### -> 1, ...
        heading = m.group(2)

        # GitHub-style anchor: lowercase, strip punctuation, spaces -> dashes.
        slug = re.sub(r'[^a-zA-Z0-9\s-]', '', heading.lower())
        slug = re.sub(r'\s+', '-', slug.strip())

        entries.append(f"{' ' * depth}- [{heading}](#{slug})")

    entries += ["", "---", ""]

    print(f" ✅ Generated TOC with {len(matches)} headers")
    return '\n'.join(entries)
|
||||||
|
|
||||||
|
def main():
    """Simple pitch deck analyzer CLI.

    Usage: ``python app.py <pdf_file>``.  Extracts slides from the PDF,
    runs the multi-agent analysis, writes a markdown report (with TOC)
    to ``processed/<stem>_analysis.md``, and uploads it.
    """
    # Require exactly one positional argument: the PDF path.
    if len(sys.argv) < 2:
        print("Usage: python app.py <pdf_file>")
        return

    pdf_path = sys.argv[1]
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found")
        return

    print(f"🚀 Processing: {pdf_path}")

    # Import what we need directly (avoid __init__.py issues)
    # NOTE(review): these are deferred, flat imports from the local
    # 'modules' directory — they bypass the package's __init__.py on purpose.
    print("📦 Importing modules...")
    sys.path.append('modules')
    from client import get_openrouter_client
    from pdf_processor import extract_slides_from_pdf
    from analysis import analyze_slides_batch
    from markdown_utils import send_to_api_and_get_haste_link
    print("✅ Modules imported successfully")

    # Extract slides: renders each page under processed/<stem>/.
    print("📄 Extracting slides...")
    slides = extract_slides_from_pdf(pdf_path, "processed", Path(pdf_path).stem)
    print(f"✅ Extracted {len(slides)} slides")

    # Analyze slides with the specialized agent panel.
    print("🧠 Analyzing slides...")
    client = get_openrouter_client()
    print("🔗 API client initialized")

    analysis_results = analyze_slides_batch(client, slides)
    print("✅ Analysis complete")

    # Create report: a single markdown string built incrementally.
    print("📝 Creating report...")
    markdown_content = f"# Pitch Deck Analysis: {Path(pdf_path).stem}\n\n"

    # Add analysis metadata
    markdown_content += "This analysis was generated using multiple AI agents, each specialized in different aspects of slide evaluation.\n\n"
    markdown_content += f"**Source File:** `{Path(pdf_path).name}` (PDF)\n"
    markdown_content += f"**Analysis Generated:** {len(slides)} slides processed\n"
    markdown_content += "**Processing Method:** Individual processing with specialized AI agents\n"
    markdown_content += "**Text Extraction:** Docling-powered text transcription\n\n"

    print(f"📊 Building markdown for {len(slides)} slides...")
    for i, slide_data in enumerate(slides):
        # Slides are keyed 1-based in analysis_results (matches page_num).
        slide_num = i + 1
        analysis = analysis_results.get(slide_num, {})

        print(f" 📄 Processing slide {slide_num}...")

        markdown_content += f"# Slide {slide_num}\n\n"
        # NOTE(review): this f-string has no placeholders — it looks like a
        # slide-image embed (e.g. ![Slide](...)) was lost here; confirm.
        markdown_content += f"\n\n"

        if analysis:
            markdown_content += "## Agentic Analysis\n\n"

            # Format each agent's analysis; skip malformed entries.
            agent_count = 0
            for agent_key, agent_data in analysis.items():
                if isinstance(agent_data, dict) and 'agent' in agent_data and 'analysis' in agent_data:
                    agent_count += 1
                    agent_name = agent_data['agent']
                    agent_analysis = agent_data['analysis']

                    markdown_content += f"### {agent_name}\n\n"
                    markdown_content += f"{agent_analysis}\n\n"

            print(f" ✅ Added {agent_count} agent analyses")
        else:
            markdown_content += "## Agentic Analysis\n\n"
            markdown_content += "No analysis available\n\n"
            print(f" ⚠️ No analysis available for slide {slide_num}")

        markdown_content += "---\n\n"

    # Generate Table of Contents
    # NOTE(review): generate_toc() prints its own "Generating..." line, so
    # this message appears twice in the console output.
    print("📋 Generating Table of Contents...")
    toc = generate_toc(markdown_content)

    # Insert TOC after the main title (assumes line 0 is the title and
    # line 1 is the blank line written by the initial f-string above).
    print("🔗 Inserting TOC into document...")
    lines = markdown_content.split('\n')
    final_content = []
    final_content.append(lines[0])  # Main title
    final_content.append("")  # Empty line
    final_content.append(toc)  # TOC
    final_content.extend(lines[2:])  # Rest of content

    final_markdown = '\n'.join(final_content)

    # Save report
    output_file = f"processed/{Path(pdf_path).stem}_analysis.md"
    print(f"💾 Saving report to: {output_file}")
    os.makedirs("processed", exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_markdown)

    print(f"✅ Report saved successfully ({len(final_markdown)} characters)")

    # Always upload the report
    print("🌐 Uploading report...")
    haste_url = send_to_api_and_get_haste_link(final_markdown, Path(pdf_path).stem)
    if haste_url:
        print(f"✅ Report uploaded to: {haste_url}")
    else:
        print("❌ Upload failed")


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
# OpenRouter API Configuration
|
||||||
|
OPENROUTER_API_KEY=your_openrouter_api_key_here
|
||||||
|
|
||||||
|
# Optional: Custom OpenAI model (defaults to gpt-3.5-turbo)
|
||||||
|
# OPENROUTER_MODEL=openai/gpt-3.5-turbo
|
||||||
|
|
@ -0,0 +1,85 @@
|
||||||
|
#!/usr/bin/env python3

# Pitch Deck Parser Modules
# This package contains all the modular components for the pitch deck analysis application

# Core pipeline: API client, file conversion, PDF/slide extraction,
# docling-based text extraction, agent analysis, and markdown output.
from .client import get_openrouter_client
from .file_utils import detect_file_type, convert_to_pdf, convert_with_libreoffice
from .pdf_processor import extract_slides_from_pdf
from .docling_processor import extract_text_with_docling, get_slide_text_content
from .analysis import (
    analyze_slide_with_single_prompt,
    analyze_slides_batch,
    analyze_slide_with_agentic_prompts_parallel,
    process_single_slide_parallel
)
from .markdown_utils import (
    create_slide_markdown,
    create_text_only_markdown,
    send_to_api_and_get_haste_link
)

# Public API of the package; extended below as feature groups are imported.
__all__ = [
    'get_openrouter_client',
    'detect_file_type',
    'convert_to_pdf',
    'convert_with_libreoffice',
    'extract_slides_from_pdf',
    'extract_text_with_docling',
    'get_slide_text_content',
    'analyze_slide_with_single_prompt',
    'analyze_slides_batch',
    'analyze_slide_with_agentic_prompts_parallel',
    'process_single_slide_parallel',
    'create_slide_markdown',
    'create_text_only_markdown',
    'send_to_api_and_get_haste_link'
]

# Market Cap RAG Validation
from .rag_agent import MarketCapRAGAgent, MarketCapClaim, ValidationResult
from .validation_report import ValidationReportGenerator
from .market_cap_validator import (
    MarketCapValidator,
    validate_market_caps,
    validate_market_caps_from_file,
    validate_market_caps_from_processed
)

# Update __all__ list
__all__.extend([
    'MarketCapRAGAgent',
    'MarketCapClaim',
    'ValidationResult',
    'ValidationReportGenerator',
    'MarketCapValidator',
    'validate_market_caps',
    'validate_market_caps_from_file',
    'validate_market_caps_from_processed'
])

# Document-specific validation
from .document_validator import (
    DocumentValidator,
    validate_document_claims,
    validate_all_processed_documents
)

# Update __all__ list
__all__.extend([
    'DocumentValidator',
    'validate_document_claims',
    'validate_all_processed_documents'
])

# Main application and CLI tools
# NOTE(review): wildcard imports run each script's module-level code at
# package import time; verify these modules are safe to import as a side
# effect (app.py guards execution behind __main__, the others unchecked).
from .app import *
from .example_usage import *
from .validate_market_caps import *

# Update __all__ list
# NOTE(review): 'app', 'example_usage', 'validate_market_caps' name the
# modules themselves, but `from .x import *` binds the modules' names, not
# the module objects — these __all__ entries likely never resolve; confirm.
__all__.extend([
    'app',
    'example_usage',
    'validate_market_caps'
])
|
||||||
|
|
@ -0,0 +1,90 @@
|
||||||
|
import re
|
||||||
|
from client import get_openrouter_client
|
||||||
|
|
||||||
|
def analyze_slides_batch(client, slides_data, batch_size=1):
    """Analyze each slide with a fixed panel of specialized AI agents.

    Args:
        client: OpenAI-compatible client (e.g. from get_openrouter_client()).
        slides_data: list of dicts, each with 'page_num' (int) and 'base64'
            (base64-encoded PNG of the rendered slide).
        batch_size: kept for backward compatibility; slides are always
            processed one at a time.

    Returns:
        Dict mapping page number -> {agent_key: {'agent': display name,
        'analysis': text}}.  API failures are recorded in-band as error
        strings in 'analysis' rather than raised, so one bad call never
        aborts the whole batch.
    """
    print(f" Processing {len(slides_data)} slides individually...")

    # The agent panel is identical for every slide — build it once.
    # (Previously this dict was re-created inside the per-slide loop.)
    agents = {
        'content_extractor': {
            'name': 'Content Extractor',
            'prompt': 'Extract and summarize the key textual content from this slide. Focus on headlines, bullet points, and main messages.'
        },
        'visual_analyzer': {
            'name': 'Visual Analyzer',
            'prompt': 'Analyze the visual design elements of this slide. Comment on layout, colors, typography, and visual hierarchy.'
        },
        'data_interpreter': {
            'name': 'Data Interpreter',
            'prompt': 'Identify and interpret any numerical data, charts, graphs, or metrics present on this slide.'
        },
        'message_evaluator': {
            'name': 'Message Evaluator',
            'prompt': 'Evaluate the effectiveness of the message delivery and communication strategy on this slide.'
        },
        'improvement_suggestor': {
            'name': 'Improvement Suggestor',
            'prompt': 'Suggest specific improvements for this slide in terms of clarity, impact, and effectiveness.'
        }
    }

    all_results = {}

    for i, slide_data in enumerate(slides_data):
        slide_num = slide_data["page_num"]
        print(f" 🔍 Analyzing slide {slide_num} ({i+1}/{len(slides_data)})...")

        slide_analysis = {}

        # Run each specialized agent against the slide image.
        for j, (agent_key, agent_config) in enumerate(agents.items()):
            # Progress counter uses the real panel size (was hard-coded "/5").
            print(f" 🤖 Running {agent_config['name']} ({j+1}/{len(agents)})...")

            messages = [
                {
                    "role": "system",
                    "content": f"You are a {agent_config['name']} specialized in analyzing pitch deck slides. {agent_config['prompt']}"
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"Analyze slide {slide_num}:"},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{slide_data['base64']}"
                            }
                        }
                    ]
                }
            ]

            try:
                print(f" 📡 Sending API request...")
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages,
                    max_tokens=500
                )

                analysis = response.choices[0].message.content.strip()
                print(f" ✅ {agent_config['name']} completed ({len(analysis)} chars)")

                slide_analysis[agent_key] = {
                    'agent': agent_config['name'],
                    'analysis': analysis
                }

            except Exception as e:
                # Record the failure in-band so the batch keeps going.
                print(f" ❌ {agent_config['name']} failed: {str(e)}")
                slide_analysis[agent_key] = {
                    'agent': agent_config['name'],
                    'analysis': f"Error analyzing slide {slide_num}: {str(e)}"
                }

        all_results[slide_num] = slide_analysis
        print(f" ✅ Slide {slide_num} analysis complete")

    print(f" 🎉 All {len(slides_data)} slides analyzed successfully!")
    return all_results
|
||||||
|
|
@ -0,0 +1,23 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from openai import OpenAI
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
|
||||||
|
def get_openrouter_client():
    """Create an OpenAI SDK client configured for the OpenRouter endpoint.

    Loads ``.env`` into the environment, then reads OPENROUTER_API_KEY.
    Exits the process with status 1 when the key is missing, empty, or
    still set to the template placeholder.
    """
    load_dotenv()  # make .env values visible via os.getenv

    key = os.getenv('OPENROUTER_API_KEY')
    placeholder = 'your_openrouter_api_key_here'
    if not key or key == placeholder:
        print("❌ Error: OPENROUTER_API_KEY not properly set in .env file")
        print("Please update your .env file with a valid OpenRouter API key")
        sys.exit(1)

    return OpenAI(base_url="https://openrouter.ai/api/v1", api_key=key)
|
||||||
|
|
@ -0,0 +1,172 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
from pathlib import Path
|
||||||
|
import fitz # PyMuPDF as fallback
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(text):
    """Normalize extracted text to flat plaintext.

    Strips LaTeX commands and math, drops characters outside a punctuation
    whitelist, and collapses every run of whitespace (including newlines)
    to a single space.

    Args:
        text: raw extracted text; may be None or empty.

    Returns:
        Cleaned single-line string ('' for falsy input).
    """
    if not text:
        return ""

    # Remove LaTeX commands and math expressions
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # Remove \command{content}
    text = re.sub(r'\$[^$]*\$', '', text)  # Remove $math$ expressions
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # Remove remaining \commands

    # Replace anything outside the allowed character whitelist with a space
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' ', text)

    # Collapse all whitespace runs to single spaces.  This also removes
    # every newline, so the original follow-up r'\n\s*\n' substitution was
    # dead code and has been removed (behavior unchanged).
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_with_docling(pdf_path, output_dir, document_name):
    """Extract text content from PDF using Docling with PyMuPDF fallback.

    Saves the cleaned text to processed/<document_name>/<document_name>_text_content.md
    and returns a dict with 'text_content', 'text_file', and 'processed_dir',
    or None when both Docling and the PyMuPDF fallback fail.

    NOTE(review): the output_dir parameter is accepted but never used — the
    output location is hard-coded to 'processed/'; confirm intent.
    """
    print(f"Extracting text content with Docling: {pdf_path}")

    try:
        # Initialize Docling converter
        converter = DocumentConverter()
        # Configure OCR for better text extraction
        # NOTE(review): assumes DocumentConverter exposes a mutable
        # `ocr_options` attribute — confirm against the installed docling
        # version (newer APIs configure OCR via pipeline options instead).
        converter.ocr_options.engine = "rapidocr"  # Use faster OCR engine
        converter.ocr_options.do_ocr = True
        converter.ocr_options.do_table_ocr = True

        # Convert PDF to text
        result = converter.convert(pdf_path)

        # Get the text content (markdown export, flattened below)
        text_content = result.document.export_to_markdown()

        # Clean the text to ensure it's plaintext
        text_content = clean_text(text_content)

        # Create processed directory structure if it doesn't exist
        processed_dir = Path("processed") / document_name
        processed_dir.mkdir(parents=True, exist_ok=True)

        # Save the text content to a file
        text_file = processed_dir / f"{document_name}_text_content.md"
        with open(text_file, 'w', encoding='utf-8') as f:
            f.write(text_content)

        print(f"✅ Text content extracted and saved to: {text_file}")

        return {
            'text_content': text_content,
            'text_file': text_file,
            'processed_dir': processed_dir
        }

    except Exception as e:
        print(f"❌ Docling failed: {e}")
        print("🔄 Trying PyMuPDF fallback...")

        # Fallback to PyMuPDF: same clean/save/return shape as above.
        try:
            text_content = extract_text_with_pymupdf(pdf_path)

            if text_content:
                # Clean the text to ensure it's plaintext
                text_content = clean_text(text_content)

                # Create processed directory structure if it doesn't exist
                processed_dir = Path("processed") / document_name
                processed_dir.mkdir(parents=True, exist_ok=True)

                # Save the text content to a file
                text_file = processed_dir / f"{document_name}_text_content.md"
                with open(text_file, 'w', encoding='utf-8') as f:
                    f.write(text_content)

                print(f"✅ Text content extracted with PyMuPDF fallback: {text_file}")

                return {
                    'text_content': text_content,
                    'text_file': text_file,
                    'processed_dir': processed_dir
                }
            else:
                print("⚠️ PyMuPDF fallback also failed")
                return None

        except Exception as fallback_error:
            print(f"❌ PyMuPDF fallback also failed: {fallback_error}")
            return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_with_pymupdf(pdf_path):
    """Fallback plaintext extraction via PyMuPDF.

    Produces one "--- Page N ---" section per page, each cleaned through
    clean_text().  Returns the concatenated string, or None on any error.
    """
    try:
        doc = fitz.open(pdf_path)
        sections = []

        for idx in range(len(doc)):
            # Clean each page's raw text before appending its section.
            body = clean_text(doc[idx].get_text())
            sections.append(f"\n--- Page {idx + 1} ---\n{body}\n")

        doc.close()
        return "".join(sections)

    except Exception as err:
        print(f"PyMuPDF extraction failed: {err}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_slide_text_content(text_content, slide_num):
    """Extract the text for one slide from the full document text.

    Looks for the '--- Page N ---' separators written by
    extract_text_with_pymupdf; when no separator matches, falls back to
    paragraph slicing and finally to an approximate line window.

    Args:
        text_content: full document text (may be empty/None).
        slide_num: 1-based slide/page number.

    Returns:
        Cleaned slide text, '' for empty input, or a placeholder string
        when extraction raises.
    """
    try:
        if not text_content:
            return ""

        # Split by page separators ('\n--- Page N ---\n')
        pages = text_content.split('--- Page')

        # Find the page for this slide.
        # BUG FIX: the original tested page.strip().startswith(f" {n} ---"),
        # but strip() removes the leading space, so the prefix could never
        # match and every lookup silently fell through to the fallbacks.
        target_page = None
        for page in pages:
            if page.strip().startswith(f"{slide_num} ---"):
                target_page = page
                break

        if target_page:
            # Remove the page header line and clean up
            lines = target_page.split('\n')[1:]  # Remove page header
            slide_text = '\n'.join(lines).strip()

            # Further clean the slide text
            slide_text = clean_text(slide_text)

            return slide_text
        else:
            # Fallback: try to extract from paragraph sections
            sections = text_content.split('\n\n')
            if slide_num <= len(sections):
                return clean_text(sections[slide_num - 1] if slide_num > 0 else sections[0])
            else:
                # Last resort: return an approximate window of lines
                lines = text_content.split('\n')
                start_line = (slide_num - 1) * 5  # Approximate 5 lines per slide
                end_line = min(start_line + 10, len(lines))  # Up to 10 lines
                slide_text = '\n'.join(lines[start_line:end_line])
                return clean_text(slide_text)

    except Exception as e:
        print(f"⚠️ Error extracting text for slide {slide_num}: {e}")
        return f"[Text content for slide {slide_num} could not be extracted]"
|
||||||
|
|
@ -0,0 +1,199 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
Document-specific validator that organizes reports by document in processed directory
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from .rag_agent import MarketCapRAGAgent
|
||||||
|
from .validation_report import ValidationReportGenerator
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentValidator:
    """
    Validates financial claims for specific documents with proper directory organization.

    Wraps a MarketCapRAGAgent (claim extraction + validation) and a
    ValidationReportGenerator (markdown report rendering), saving reports
    under processed/<document_name>/.
    """

    def __init__(self, api_key: Optional[str] = None):
        # api_key is forwarded to the RAG agent; None means "use its default".
        self.rag_agent = MarketCapRAGAgent(api_key)
        self.report_generator = ValidationReportGenerator()

    def validate_document(self, document_name: str, slide_texts: List[Dict[str, Any]],
                          save_report: bool = True) -> Dict[str, Any]:
        """
        Validate financial claims for a specific document

        Args:
            document_name: Name of the document (e.g., "Uber-Pitch-Deck")
            slide_texts: List of slide data with 'slide_number' and 'text' keys
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary with keys 'document_name', 'validation_results',
            'report', 'report_filename' (None when save_report is False),
            and 'summary' (see _generate_summary).
        """
        print(f"🔍 Validating financial claims for: {document_name}")

        # Extract and validate claims
        validation_results = self.rag_agent.validate_all_claims(slide_texts)

        # Generate report
        report = self.report_generator.generate_report(validation_results, slide_texts)

        # Save report in proper directory structure
        report_filename = None
        if save_report:
            # Create document-specific directory
            doc_dir = os.path.join("processed", document_name)
            os.makedirs(doc_dir, exist_ok=True)

            # Save report in document directory
            report_filename = self.report_generator.save_report(
                report,
                f"{document_name}_market_cap_validation.md",
                doc_dir
            )
            print(f"📄 Validation report saved to: {report_filename}")

        # Prepare summary
        summary = self._generate_summary(validation_results)

        return {
            'document_name': document_name,
            'validation_results': validation_results,
            'report': report,
            'report_filename': report_filename,
            'summary': summary
        }

    def validate_from_processed_folder(self, folder_path: str = "processed") -> Dict[str, Any]:
        """
        Validate all documents in the processed folder

        Args:
            folder_path: Path to processed folder

        Returns:
            Dictionary with results for each document; a failing document
            maps to {'error': message} instead of aborting the batch.

        Raises:
            ValueError: when folder_path does not exist.
        """
        results = {}

        if not os.path.exists(folder_path):
            raise ValueError(f"Processed folder not found: {folder_path}")

        # Find all document directories (skip hidden entries)
        for item in os.listdir(folder_path):
            item_path = os.path.join(folder_path, item)
            if os.path.isdir(item_path) and not item.startswith('.'):
                # Look for text content files written by the extraction step
                text_files = [f for f in os.listdir(item_path) if f.endswith('_text_content.md')]

                if text_files:
                    document_name = item
                    # Only the first matching text file is used
                    text_file = os.path.join(item_path, text_files[0])

                    print(f"📁 Processing document: {document_name}")

                    # Read text content
                    with open(text_file, 'r', encoding='utf-8') as f:
                        content = f.read()

                    # Convert to slide format
                    # NOTE(review): the whole document is collapsed into a
                    # single slide entry here, so per-slide claim numbers in
                    # the report will all be 1 — confirm this is intended.
                    slide_texts = [{
                        "slide_number": 1,
                        "text": content
                    }]

                    # Validate document; record failures in-band
                    try:
                        doc_results = self.validate_document(document_name, slide_texts)
                        results[document_name] = doc_results
                    except Exception as e:
                        print(f"❌ Error processing {document_name}: {e}")
                        results[document_name] = {'error': str(e)}

        return results

    def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
        """Generate a summary of validation results.

        Returns counts of accurate/inaccurate claims, an accuracy_rate
        percentage (0 when there are no claims), and a per-slide grouping.
        """
        total_claims = len(validation_results)
        accurate_claims = sum(1 for r in validation_results if r.is_accurate)
        inaccurate_claims = total_claims - accurate_claims

        return {
            'total_claims': total_claims,
            'accurate_claims': accurate_claims,
            'inaccurate_claims': inaccurate_claims,
            'accuracy_rate': (accurate_claims / total_claims * 100) if total_claims > 0 else 0,
            'claims_by_slide': self._group_claims_by_slide(validation_results)
        }

    def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
        """Group validation results by their claim's slide number."""
        claims_by_slide = {}
        for result in validation_results:
            slide_num = result.claim.slide_number
            if slide_num not in claims_by_slide:
                claims_by_slide[slide_num] = []
            claims_by_slide[slide_num].append(result)
        return claims_by_slide
|
||||||
|
|
||||||
|
|
||||||
|
def validate_document_claims(document_name: str, slide_texts: List[Dict[str, Any]],
                             api_key: Optional[str] = None,
                             save_report: bool = True) -> Dict[str, Any]:
    """Module-level shortcut around DocumentValidator.validate_document.

    Args:
        document_name: Name of the document.
        slide_texts: List of slide data ('slide_number' / 'text' dicts).
        api_key: OpenRouter API key (optional).
        save_report: Whether to write the validation report to disk.

    Returns:
        Dictionary containing validation results and report.
    """
    return DocumentValidator(api_key).validate_document(
        document_name, slide_texts, save_report
    )
|
||||||
|
|
||||||
|
|
||||||
|
def validate_all_processed_documents(folder_path: str = "processed",
                                     api_key: Optional[str] = None) -> Dict[str, Any]:
    """Module-level shortcut that validates every document under *folder_path*.

    Args:
        folder_path: Path to the processed folder.
        api_key: OpenRouter API key (optional).

    Returns:
        Dictionary with results for each document.
    """
    return DocumentValidator(api_key).validate_from_processed_folder(folder_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Example usage: validate every document under processed/ and print a
    # one-line status per document.
    print("Document Validator - RAG Agent")
    print("===============================")

    try:
        results = validate_all_processed_documents()

        print(f"\n✅ Validation Complete!")
        print(f"📊 Processed {len(results)} documents:")

        for doc_name, doc_results in results.items():
            # Per-document failures are reported in-band under 'error'.
            if 'error' in doc_results:
                print(f" ❌ {doc_name}: {doc_results['error']}")
            else:
                summary = doc_results['summary']
                print(f" ✅ {doc_name}: {summary['total_claims']} claims, {summary['accuracy_rate']:.1f}% accurate")
                if doc_results['report_filename']:
                    print(f" 📄 Report: {doc_results['report_filename']}")

    except Exception as e:
        # Catch-all boundary for the CLI demo; keeps the script from
        # dying with a traceback.
        print(f"❌ Error: {e}")
|
||||||
|
|
@ -0,0 +1,111 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def detect_file_type(file_path):
    """Classify a file by its extension.

    Returns one of 'pdf', 'powerpoint', 'word', 'openoffice_presentation',
    'openoffice_document', or 'unknown' for anything unrecognized.
    Matching is case-insensitive.
    """
    extension_map = {
        '.pdf': 'pdf',
        '.pptx': 'powerpoint',
        '.ppt': 'powerpoint',
        '.docx': 'word',
        '.doc': 'word',
        '.odp': 'openoffice_presentation',
        '.odt': 'openoffice_document',
    }
    return extension_map.get(Path(file_path).suffix.lower(), 'unknown')
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_pdf(input_file, output_dir, document_name):
    """Convert PowerPoint/Word/OpenOffice files to PDF (no-op for PDFs).

    Args:
        input_file: source document path.
        output_dir: directory where the temporary PDF is written.
        document_name: base name used for the temporary PDF.

    Returns:
        Path (str) to the resulting PDF, or None on failure.
    """
    file_type = detect_file_type(input_file)

    if file_type == 'pdf':
        print("✅ File is already PDF, no conversion needed")
        return input_file

    print(f"🔄 Converting {file_type} file to PDF...")

    # BUG FIX: temp_pdf was built with string concatenation but later used
    # as a Path (temp_pdf.exists() raised AttributeError on str); build a
    # real Path so .exists() and downstream .parent work.
    temp_pdf = Path(output_dir) / f"{document_name}_temp.pdf"

    try:
        if file_type == 'powerpoint':
            # Convert PowerPoint to PDF using pptxtopdf
            print(" Using pptxtopdf for PowerPoint conversion...")
            result = subprocess.run([
                'python', '-c',
                f'import pptxtopdf; pptxtopdf.convert("{input_file}", "{temp_pdf}")'
            ], capture_output=True, text=True, timeout=60)

            if result.returncode != 0:
                print(f"⚠️ pptxtopdf failed: {result.stderr}")
                # Fallback: try using LibreOffice
                return convert_with_libreoffice(input_file, temp_pdf, file_type)

        elif file_type in ['word', 'openoffice_document', 'openoffice_presentation']:
            # LibreOffice handles all document and presentation formats
            # (the two former elif branches were identical — merged).
            return convert_with_libreoffice(input_file, temp_pdf, file_type)

        else:
            print(f"❌ Unsupported file type: {file_type}")
            return None

        if temp_pdf.exists():
            print(f"✅ Successfully converted to PDF: {temp_pdf}")
            return str(temp_pdf)
        else:
            print("❌ Conversion failed - PDF file not created")
            return None

    except subprocess.TimeoutExpired:
        print("❌ Conversion timed out")
        return None
    except Exception as e:
        print(f"❌ Conversion error: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def convert_with_libreoffice(input_file, output_pdf, file_type):
    """Convert a document to PDF using LibreOffice as a fallback converter.

    Args:
        input_file: Path to the source document (str or Path).
        output_pdf: Desired path for the resulting PDF (str or Path).
            Callers pass plain strings, so it is normalized to Path here.
        file_type: Human-readable file-type label, used only for logging.

    Returns:
        str path to the converted PDF on success, or None on failure.
    """
    try:
        print(f" Using LibreOffice for {file_type} conversion...")

        # Bug fix: callers pass output_pdf as a plain string, but the original
        # code called .parent/.exists()/.rename() on it (AttributeError).
        output_pdf = Path(output_pdf)

        # LibreOffice command; it writes the PDF into --outdir using the
        # input file's base name, not the name we want.
        cmd = [
            'soffice', '--headless', '--convert-to', 'pdf',
            '--outdir', str(output_pdf.parent),
            str(input_file)
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

        if result.returncode == 0:
            # LibreOffice creates a PDF with the same stem as the input.
            libreoffice_pdf = output_pdf.parent / f"{Path(input_file).stem}.pdf"

            if libreoffice_pdf.exists():
                # Rename to the temp name the caller expects.
                libreoffice_pdf.rename(output_pdf)
                print(f"✅ LibreOffice conversion successful: {output_pdf}")
                return str(output_pdf)

        print(f"⚠️ LibreOffice conversion failed: {result.stderr}")
        return None

    except subprocess.TimeoutExpired:
        print("❌ LibreOffice conversion timed out")
        return None
    except Exception as e:
        print(f"❌ LibreOffice conversion error: {e}")
        return None
|
||||||
|
|
@ -0,0 +1,173 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def clean_markdown_text(text):
    """Clean markdown text to ensure it's plaintext with no special characters.

    Strips LaTeX commands, inline math, markdown emphasis/code/header markup,
    and disallowed symbols, then normalizes whitespace while preserving
    paragraph breaks.

    Args:
        text: Markdown/LaTeX-flavored string (may be empty or None).

    Returns:
        Cleaned plaintext string ("" for falsy input).
    """
    if not text:
        return ""

    # Remove LaTeX commands and math expressions
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # Remove \command{content}
    text = re.sub(r'\$[^$]*\$', '', text)             # Remove $math$ expressions
    text = re.sub(r'\\[a-zA-Z]+', '', text)           # Remove remaining \commands

    # Remove markdown formatting but keep the text
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)    # Remove bold **text**
    text = re.sub(r'\*([^*]+)\*', r'\1', text)        # Remove italic *text*
    text = re.sub(r'`([^`]+)`', r'\1', text)          # Remove code `text`
    text = re.sub(r'#{1,6}\s*', '', text)             # Remove headers # ## ###

    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' ', text)

    # Bug fix: the original collapsed ALL whitespace (including newlines)
    # to single spaces first, which made the subsequent blank-line
    # normalization dead code and destroyed paragraph breaks. Normalize
    # blank-line runs first, then collapse only horizontal whitespace.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r'[^\S\n]+', ' ', text)

    return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def create_slide_markdown(slide_data, analysis_results, slide_num, slide_text=""):
    """Create markdown content for a single slide with all agentic analyses and text content"""
    # Assemble the document from ordered fragments, then join once at the end.
    parts = [
        f"# Slide {slide_num}\n\n![Slide {slide_num}](slides/{slide_data['filename']})\n\n"
    ]

    # A "Text Content" section is emitted only when the slide has non-blank text.
    if slide_text and slide_text.strip():
        # Reduce the slide text to plaintext before embedding it.
        parts.append(f"## Text Content\n\n{clean_markdown_text(slide_text)}\n\n")

    parts.append("## Agentic Analysis\n\n")

    # One "### <agent>" subsection per analysis result, text cleaned to plaintext.
    for result in analysis_results.values():
        parts.append(f"### {result['agent']}\n\n{clean_markdown_text(result['analysis'])}\n\n")

    # Horizontal rule separates this slide from the next one.
    parts.append("---\n\n")
    return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def create_text_only_markdown(markdown_content):
    """Create a text-only version of markdown without image references for API submission"""
    # Ordered (pattern, flags) pairs whose matches are deleted outright.
    removals = (
        (r'!\[.*?\]\(slides/.*?\)\n', 0),                            # image embeds
        (r'\*\[View full size: slides/.*?\]\(slides/.*?\)\*\n', 0),  # image links
        (r'^---\n', re.MULTILINE),                                   # slide separators
    )

    stripped = markdown_content
    for pattern, flags in removals:
        stripped = re.sub(pattern, '', stripped, flags=flags)

    # Collapse runs of three or more newlines left behind by the removals.
    stripped = re.sub(r'\n{3,}', '\n\n', stripped)

    # Final pass reduces everything to plaintext for the API.
    return clean_markdown_text(stripped).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def send_to_api_and_get_haste_link(markdown_content, document_title):
    """Send markdown to API and get both raw markdown and HTML URLs.

    Uploads a text-only version of the markdown to haste.nixc.us (raw paste)
    and to md.colinknapp.com (rendered HTML). Each upload is best-effort.

    Args:
        markdown_content: Full markdown document (image references included).
        document_title: Title used for the rendered HTML page.

    Returns:
        (raw_haste_url, html_url) tuple; either element may be None if its
        upload fails, and (None, None) is returned on total failure.
    """
    try:
        print("Sending to API for URLs...")

        # Images cannot be uploaded as part of the paste, so strip them first.
        text_only_markdown = create_text_only_markdown(markdown_content)

        # First, send raw markdown to haste.nixc.us
        raw_haste_url = None
        try:
            print(" 📝 Creating raw markdown URL...")
            raw_response = requests.post(
                "https://haste.nixc.us/documents",
                data=text_only_markdown.encode('utf-8'),
                headers={"Content-Type": "text/plain"},
                timeout=30
            )

            if raw_response.status_code == 200:
                raw_token = raw_response.text.strip().strip('"')
                # Some hastebin deployments answer with JSON ({"key": "..."})
                # instead of a bare token; unwrap it when that shape appears.
                if raw_token.startswith('{"key":"') and raw_token.endswith('"}'):
                    # Fix: narrow the bare `except:` (which hid real errors)
                    # and drop the redundant inner `import json` — the module
                    # already imports json at the top.
                    try:
                        raw_token = json.loads(raw_token)['key']
                    except (json.JSONDecodeError, KeyError):
                        pass  # best-effort: fall back to the raw body text
                raw_haste_url = f"https://haste.nixc.us/{raw_token}"
                print(f" ✅ Raw markdown URL created")
            else:
                print(f" ⚠️ Raw markdown upload failed with status {raw_response.status_code}")
        except Exception as e:
            print(f" ⚠️ Failed to create raw markdown URL: {e}")

        # Then, send to md.colinknapp.com for HTML version
        html_url = None
        try:
            print(" 🎨 Creating HTML version URL...")
            api_data = {
                "markdown": text_only_markdown,
                "format": "html",
                "template": "playful",
                "title": f"Pitch Deck Analysis: {document_title}",
                "subtitle": "AI-Generated Analysis with Agentic Insights",
                "contact": "Generated by Pitch Deck Parser",
                "send_to_haste": True
            }

            response = requests.post(
                "https://md.colinknapp.com/api/convert",
                headers={"Content-Type": "application/json"},
                data=json.dumps(api_data),
                timeout=30
            )

            if response.status_code == 200:
                result = response.json()
                if 'haste_url' in result:
                    # Rewrite haste tokens to the HTML viewer on md.colinknapp.com.
                    haste_url = result['haste_url']
                    if 'haste.nixc.us/' in haste_url:
                        token = haste_url.split('haste.nixc.us/')[-1]
                        html_url = f"https://md.colinknapp.com/haste/{token}"
                    else:
                        html_url = haste_url
                    print(f" ✅ HTML version URL created")
                else:
                    print(" ⚠️ API response missing haste_url")
            else:
                print(f" ⚠️ HTML API request failed with status {response.status_code}")
        except Exception as e:
            print(f" ⚠️ Failed to create HTML URL: {e}")

        return raw_haste_url, html_url

    except Exception as e:
        print(f"⚠️ Failed to send to API: {e}")
        return None, None
|
||||||
|
|
@ -0,0 +1,235 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
Market Cap Validator - Main Interface
|
||||||
|
|
||||||
|
This module provides a simple interface to validate market cap claims
|
||||||
|
from pitch deck slides using RAG search capabilities.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from .rag_agent import MarketCapRAGAgent
|
||||||
|
from .validation_report import ValidationReportGenerator
|
||||||
|
|
||||||
|
|
||||||
|
class MarketCapValidator:
    """
    Main interface for market cap validation using RAG search.

    Composes a MarketCapRAGAgent (claim extraction and web-search
    validation) with a ValidationReportGenerator (report creation and
    persistence).
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the market cap validator

        Args:
            api_key: OpenRouter API key (if not provided, will use environment variable)
        """
        self.rag_agent = MarketCapRAGAgent(api_key)
        self.report_generator = ValidationReportGenerator()

    def validate_from_slides(self, slide_texts: List[Dict[str, Any]],
                             save_report: bool = True) -> Dict[str, Any]:
        """
        Validate market cap claims from slide text exports

        Args:
            slide_texts: List of slide data with 'slide_number' and 'text' keys
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary with 'validation_results', 'report', 'report_filename'
            (None when save_report is False) and 'summary' keys
        """
        print("🔍 Starting market cap validation process...")

        # Extract and validate claims via the RAG agent
        validation_results = self.rag_agent.validate_all_claims(slide_texts)

        # Generate report
        report = self.report_generator.generate_report(validation_results, slide_texts)

        # Save report if requested
        report_filename = None
        if save_report:
            report_filename = self.report_generator.save_report(report)
            print(f"📄 Validation report saved to: {report_filename}")

        # Prepare summary
        summary = self._generate_summary(validation_results)

        return {
            'validation_results': validation_results,
            'report': report,
            'report_filename': report_filename,
            'summary': summary
        }

    def validate_from_file(self, file_path: str, save_report: bool = True) -> Dict[str, Any]:
        """
        Validate market cap claims from a JSON file containing slide texts

        Args:
            file_path: Path to JSON file with slide data
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary containing validation results and report

        Raises:
            FileNotFoundError: If file_path does not exist.
            ValueError: If the file is not valid JSON.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                slide_texts = json.load(f)

            print(f"📁 Loaded slide data from: {file_path}")
            return self.validate_from_slides(slide_texts, save_report)

        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {file_path}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON file: {e}")

    def validate_from_processed_folder(self, folder_path: str = "processed",
                                       save_report: bool = True) -> Dict[str, Any]:
        """
        Validate market cap claims from processed slide files

        Args:
            folder_path: Path to folder containing processed slide files
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary containing validation results and report

        Raises:
            ValueError: If no valid slide data is found in folder_path.
        """
        slide_texts = []

        # Look for JSON files in the processed folder
        if os.path.exists(folder_path):
            for filename in os.listdir(folder_path):
                if filename.endswith('.json'):
                    file_path = os.path.join(folder_path, filename)
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            data = json.load(f)

                        # Handle different JSON structures
                        if isinstance(data, list):
                            slide_texts.extend(data)
                        elif isinstance(data, dict) and 'slides' in data:
                            slide_texts.extend(data['slides'])
                        elif isinstance(data, dict) and 'text' in data:
                            slide_texts.append(data)

                    except (json.JSONDecodeError, KeyError) as e:
                        # Fix: report which file was skipped — the original
                        # printed the literal placeholder "(unknown)".
                        print(f"⚠️ Skipping invalid file {filename}: {e}")
                        continue

        if not slide_texts:
            raise ValueError(f"No valid slide data found in {folder_path}")

        print(f"📁 Loaded {len(slide_texts)} slides from processed folder")
        return self.validate_from_slides(slide_texts, save_report)

    def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
        """Generate an aggregate summary of validation results"""
        total_claims = len(validation_results)
        accurate_claims = sum(1 for r in validation_results if r.is_accurate)
        inaccurate_claims = total_claims - accurate_claims

        return {
            'total_claims': total_claims,
            'accurate_claims': accurate_claims,
            'inaccurate_claims': inaccurate_claims,
            # Guard against division by zero when no claims were found
            'accuracy_rate': (accurate_claims / total_claims * 100) if total_claims > 0 else 0,
            'claims_by_slide': self._group_claims_by_slide(validation_results)
        }

    def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
        """Group validation results by their claim's slide number"""
        claims_by_slide = {}
        for result in validation_results:
            # setdefault replaces the manual key-existence check
            claims_by_slide.setdefault(result.claim.slide_number, []).append(result)
        return claims_by_slide
|
||||||
|
|
||||||
|
|
||||||
|
def validate_market_caps(slide_texts: List[Dict[str, Any]],
                         api_key: Optional[str] = None,
                         save_report: bool = True) -> Dict[str, Any]:
    """
    Convenience function to validate market cap claims.

    Args:
        slide_texts: List of slide data with 'slide_number' and 'text' keys
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Dictionary containing validation results and report
    """
    # Thin wrapper: construct a validator and delegate in a single expression.
    return MarketCapValidator(api_key).validate_from_slides(slide_texts, save_report)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_market_caps_from_file(file_path: str,
                                   api_key: Optional[str] = None,
                                   save_report: bool = True) -> Dict[str, Any]:
    """
    Convenience function to validate market cap claims from a file.

    Args:
        file_path: Path to JSON file with slide data
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Dictionary containing validation results and report
    """
    # Thin wrapper: construct a validator and delegate in a single expression.
    return MarketCapValidator(api_key).validate_from_file(file_path, save_report)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_market_caps_from_processed(folder_path: str = "processed",
                                        api_key: Optional[str] = None,
                                        save_report: bool = True) -> Dict[str, Any]:
    """
    Convenience function to validate market cap claims from processed folder.

    Args:
        folder_path: Path to folder containing processed slide files
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Dictionary containing validation results and report
    """
    # Thin wrapper: construct a validator and delegate in a single expression.
    return MarketCapValidator(api_key).validate_from_processed_folder(folder_path, save_report)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Example usage: running this module directly validates every document
    # found in the processed/ folder and prints a summary.
    print("Market Cap Validator - RAG Agent")
    print("=================================")

    # Try to validate from processed folder
    try:
        results = validate_market_caps_from_processed()

        print(f"\n✅ Validation Complete!")
        print(f"📊 Summary:")
        print(f" - Total Claims: {results['summary']['total_claims']}")
        print(f" - Accurate: {results['summary']['accurate_claims']}")
        print(f" - Inaccurate: {results['summary']['inaccurate_claims']}")
        print(f" - Accuracy Rate: {results['summary']['accuracy_rate']:.1f}%")

        # report_filename is only set when saving was left enabled
        if results['report_filename']:
            print(f"📄 Report saved to: {results['report_filename']}")

    except Exception as e:
        # Surface the failure, then show how the tool is meant to be invoked
        print(f"❌ Error: {e}")
        print("\nUsage examples:")
        print("1. Place slide data JSON files in 'processed/' folder")
        print("2. Run: python -m modules.market_cap_validator")
        print("3. Or use the functions directly in your code")
|
||||||
|
|
@ -0,0 +1,60 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import fitz # PyMuPDF for PDF processing
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def extract_slides_from_pdf(pdf_path, output_dir, document_name):
    """Extract individual slides from PDF as images.

    Renders each page as a high-resolution PNG under
    ./processed/<document_name>/slides/ and also base64-encodes it for
    API submission.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Unused; kept for backward compatibility with callers —
            output always goes under ./processed/<document_name>/.
        document_name: Name used for the output folder and file prefixes.

    Returns:
        List of per-page dicts with keys 'page_num', 'filename', 'path',
        'base64', 'document_name', 'processed_dir'; empty list on failure.
    """
    print(f"Extracting slides from PDF: {pdf_path}")

    # Create processed directory structure: ./processed/DocumentName/slides/
    # (parents=True creates both levels in one call)
    processed_dir = Path("processed") / document_name
    slides_dir = processed_dir / "slides"
    slides_dir.mkdir(parents=True, exist_ok=True)

    slides = []
    pdf_document = None

    try:
        # Open PDF with PyMuPDF
        pdf_document = fitz.open(pdf_path)

        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]

            # Convert page to image (2x zoom for better quality)
            mat = fitz.Matrix(2.0, 2.0)
            pix = page.get_pixmap(matrix=mat)

            # Save as PNG with document name prefix
            slide_filename = f"{document_name}_slide_{page_num + 1:03d}.png"
            slide_path = slides_dir / slide_filename
            pix.save(str(slide_path))

            # Convert to base64 for API
            img_data = pix.tobytes("png")
            img_base64 = base64.b64encode(img_data).decode('utf-8')

            slides.append({
                'page_num': page_num + 1,
                'filename': slide_filename,
                'path': slide_path,
                'base64': img_base64,
                'document_name': document_name,
                'processed_dir': processed_dir
            })

            print(f" Extracted slide {page_num + 1}")

        print(f"✅ Extracted {len(slides)} slides")
        return slides

    except Exception as e:
        print(f"❌ Error extracting slides: {e}")
        return []
    finally:
        # Fix: the original only closed the document on the success path,
        # leaking the handle when a page failed mid-loop.
        if pdf_document is not None:
            pdf_document.close()
|
||||||
|
|
@ -0,0 +1,286 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from .client import get_openrouter_client
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MarketCapClaim:
    """Represents a market cap claim found in slide text"""
    # Slide the claim was found on (0 when the export had no slide number)
    slide_number: int
    # Company the claim refers to, extracted from the slide's first lines
    company_name: str
    # The claimed value as written in the slide, e.g. "2.5B" — not normalized
    claimed_market_cap: str
    # The exact substring of slide text that matched a market-cap pattern
    raw_text: str
    # Heuristic score (capped at 1.0) based on the words around the match
    confidence: float
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ValidationResult:
    """Represents the validation result for a market cap claim"""
    # The original claim being validated
    claim: MarketCapClaim
    # Market cap value found by the RAG search, or None if nothing was found
    validated_market_cap: Optional[str]
    # Where the validated figure came from ("RAG Search" when unattributed)
    validation_source: str
    # Confidence in the validation outcome (0.0 when the search failed)
    confidence_score: float
    # True when claimed and validated values agree within ~20%
    is_accurate: bool
    # Human-readable description of a mismatch, or None when accurate
    discrepancy: Optional[str]
    # The query string sent to the web search
    rag_search_query: str
    # The model's full textual response (or an error message)
    rag_response: str
|
||||||
|
|
||||||
|
|
||||||
|
class MarketCapRAGAgent:
    """
    RAG Agent for validating market cap claims from pitch deck slides
    using OpenRouter's web search capabilities.
    """

    def __init__(self, api_key: Optional[str] = None):
        # NOTE(review): api_key is accepted but not forwarded —
        # get_openrouter_client() is called with no arguments; confirm
        # whether the client reads the key from the environment or the
        # parameter should be passed through.
        self.client = get_openrouter_client()
        # Each pattern captures a numeric value with an optional suffix.
        # Generalized [BMK] -> [BMKT]: _normalize_value already understands
        # trillion-scale values, so trillion claims are now matched too.
        self.market_cap_patterns = [
            r'market\s+cap(?:italization)?\s*:?\s*\$?([0-9,.]+[BMKT]?)',
            r'valuation\s*:?\s*\$?([0-9,.]+[BMKT]?)',
            r'worth\s*:?\s*\$?([0-9,.]+[BMKT]?)',
            r'valued\s+at\s*:?\s*\$?([0-9,.]+[BMKT]?)',
            r'\$([0-9,.]+[BMKT]?)\s+(?:market\s+cap|valuation)',
            r'(?:market\s+cap|valuation)\s+of\s+\$?([0-9,.]+[BMKT]?)'
        ]

    def extract_market_cap_claims(self, slide_texts: List[Dict[str, Any]]) -> List["MarketCapClaim"]:
        """
        Extract market cap claims from slide text exports

        Args:
            slide_texts: List of slide data with 'slide_number' and 'text' keys

        Returns:
            List of MarketCapClaim objects
        """
        claims = []

        for slide_data in slide_texts:
            slide_number = slide_data.get('slide_number', 0)
            text = slide_data.get('text', '')

            if not text:
                continue

            # Company name is usually in the first few lines or the title
            company_name = self._extract_company_name(text)

            # Search for market cap patterns
            for pattern in self.market_cap_patterns:
                for match in re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE):
                    claims.append(MarketCapClaim(
                        slide_number=slide_number,
                        company_name=company_name,
                        claimed_market_cap=match.group(1),
                        raw_text=match.group(0),
                        # Confidence depends on the words around the match
                        confidence=self._calculate_confidence(text, match.start(), match.end())
                    ))

        return claims

    def _extract_company_name(self, text: str) -> str:
        """Extract company name from slide text (best-effort heuristic)."""
        lines = text.split('\n')[:5]  # Check first 5 lines

        for line in lines:
            line = line.strip()
            # A plausible name is short but not trivially short
            if line and len(line) > 2 and len(line) < 100:
                # Skip common slide headers
                if not any(header in line.lower() for header in ['slide', 'page', 'agenda', 'overview']):
                    return line

        return "Unknown Company"

    def _calculate_confidence(self, text: str, start: int, end: int) -> float:
        """Calculate confidence score for a market cap claim (0.5 base, capped at 1.0)."""
        confidence = 0.5  # Base confidence

        # Extract ~50 chars of context on either side of the match
        context_start = max(0, start - 50)
        context_end = min(len(text), end + 50)
        context = text[context_start:context_end].lower()

        # Recency indicators
        if any(indicator in context for indicator in ['current', 'latest', 'as of', '2024', '2025']):
            confidence += 0.2

        # Explicit magnitude words
        if any(indicator in context for indicator in ['billion', 'million', 'trillion']):
            confidence += 0.1

        # The claim is explicitly labelled
        if 'market cap' in context or 'valuation' in context:
            confidence += 0.2

        return min(confidence, 1.0)

    def validate_claim_with_rag(self, claim: "MarketCapClaim") -> "ValidationResult":
        """
        Validate a market cap claim using RAG search

        Args:
            claim: MarketCapClaim to validate

        Returns:
            ValidationResult with validation details
        """
        # Construct RAG search query
        search_query = f"{claim.company_name} current market cap valuation 2024 2025"

        try:
            # Use OpenRouter with online search enabled
            response = self.client.chat.completions.create(
                model="mistralai/mistral-small",
                messages=[
                    {
                        "role": "user",
                        "content": f"""
Please search for the current market cap or valuation of {claim.company_name}.

The company claims their market cap is ${claim.claimed_market_cap}.

Please provide:
1. The current market cap/valuation if found
2. The source of this information
3. Whether the claimed value appears accurate
4. Any significant discrepancies

Focus on recent data from 2024-2025.
"""
                    }
                ],
                max_tokens=800
            )

            rag_response = response.choices[0].message.content.strip()

            # Parse the response to extract validation details
            validation_details = self._parse_rag_response(rag_response, claim)

            return ValidationResult(
                claim=claim,
                validated_market_cap=validation_details.get('validated_cap'),
                validation_source=validation_details.get('source', 'RAG Search'),
                confidence_score=validation_details.get('confidence', 0.5),
                is_accurate=validation_details.get('is_accurate', False),
                discrepancy=validation_details.get('discrepancy'),
                rag_search_query=search_query,
                rag_response=rag_response
            )

        except Exception as e:
            # Degrade gracefully: record the failure as an inaccurate result
            return ValidationResult(
                claim=claim,
                validated_market_cap=None,
                validation_source="Error",
                confidence_score=0.0,
                is_accurate=False,
                discrepancy=f"RAG search failed: {str(e)}",
                rag_search_query=search_query,
                rag_response=f"Error: {str(e)}"
            )

    def _parse_rag_response(self, response: str, claim: "MarketCapClaim") -> Dict[str, Any]:
        """Parse RAG response to extract validation details"""
        details = {
            'validated_cap': None,
            'source': 'RAG Search',
            'confidence': 0.5,
            'is_accurate': False,
            'discrepancy': None
        }

        response_lower = response.lower()

        # Bug fix: the response is lowercased, but the original classes were
        # uppercase [BMK] without IGNORECASE, so suffixes were silently
        # dropped and normalized comparison was off by orders of magnitude.
        # Match lowercase suffixes (T added for trillion).
        cap_patterns = [
            r'\$([0-9,.]+[bmkt]?)',
            r'([0-9,.]+[bmkt]?)\s+(?:billion|million|trillion)',
            r'market\s+cap(?:italization)?\s*:?\s*\$?([0-9,.]+[bmkt]?)'
        ]

        for pattern in cap_patterns:
            matches = re.findall(pattern, response_lower)
            if matches:
                details['validated_cap'] = matches[0]
                break

        # Determine accuracy by comparing normalized magnitudes
        if details['validated_cap']:
            claimed_normalized = self._normalize_value(claim.claimed_market_cap)
            validated_normalized = self._normalize_value(details['validated_cap'])

            if claimed_normalized and validated_normalized:
                # Allow for some variance (within 20%)
                ratio = min(claimed_normalized, validated_normalized) / max(claimed_normalized, validated_normalized)
                details['is_accurate'] = ratio > 0.8

                if not details['is_accurate']:
                    details['discrepancy'] = f"Claimed: ${claim.claimed_market_cap}, Found: ${details['validated_cap']}"

        # Extract source information when the model attributes its answer
        if 'source:' in response_lower or 'according to' in response_lower:
            source_match = re.search(r'(?:source:|according to)\s*([^\n]+)', response_lower)
            if source_match:
                details['source'] = source_match.group(1).strip()

        return details

    def _normalize_value(self, value: str) -> Optional[float]:
        """Normalize market cap value (e.g. "2.5B") to a comparable float, or None."""
        if not value:
            return None

        value = value.replace(',', '').upper()

        # Map suffix to multiplier; uppercase handles lowercase inputs too
        multiplier = 1
        if value.endswith('B'):
            multiplier = 1_000_000_000
            value = value[:-1]
        elif value.endswith('M'):
            multiplier = 1_000_000
            value = value[:-1]
        elif value.endswith('K'):
            multiplier = 1_000
            value = value[:-1]
        elif value.endswith('T'):
            multiplier = 1_000_000_000_000
            value = value[:-1]

        try:
            return float(value) * multiplier
        except ValueError:
            return None

    def validate_all_claims(self, slide_texts: List[Dict[str, Any]]) -> List["ValidationResult"]:
        """
        Extract and validate all market cap claims from slide texts

        Args:
            slide_texts: List of slide data with 'slide_number' and 'text' keys

        Returns:
            List of ValidationResult objects
        """
        claims = self.extract_market_cap_claims(slide_texts)
        results = []

        print(f"Found {len(claims)} market cap claims to validate...")

        for i, claim in enumerate(claims, 1):
            print(f" Validating claim {i}/{len(claims)}: {claim.company_name} - ${claim.claimed_market_cap}")
            results.append(self.validate_claim_with_rag(claim))

        return results
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
pdf2image
|
||||||
|
openai
|
||||||
|
requests
|
||||||
|
PyMuPDF
|
||||||
|
docling
|
||||||
|
python-dotenv
|
||||||
|
|
@ -0,0 +1,129 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
Clean Market Cap Validation CLI
|
||||||
|
|
||||||
|
Validates market cap claims from pitch deck slides using RAG search.
|
||||||
|
Reports are automatically organized in the processed/ directory.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
from modules.document_validator import (
|
||||||
|
validate_document_claims,
|
||||||
|
validate_all_processed_documents
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Entry point for the market cap validation CLI.

    Parses command-line arguments, resolves the OpenRouter API key, and
    dispatches to either single-file validation (--file) or batch
    validation of every document under processed/ (--all, also the
    default when no mode is given).
    """
    parser = argparse.ArgumentParser(
        description="Validate market cap claims from pitch deck slides using RAG search"
    )
    parser.add_argument(
        '--file', '-f',
        help='Path to JSON file containing slide data'
    )
    parser.add_argument(
        '--document', '-d',
        help='Document name for organized reporting'
    )
    parser.add_argument(
        '--all',
        action='store_true',
        help='Validate all documents in processed/ folder'
    )
    parser.add_argument(
        '--no-save',
        action='store_true',
        help='Do not save validation report to file'
    )
    parser.add_argument(
        '--api-key',
        help='OpenRouter API key (or set OPENROUTER_API_KEY environment variable)'
    )

    args = parser.parse_args()

    # Get API key: explicit flag wins over the environment variable.
    api_key = args.api_key or os.getenv('OPENROUTER_API_KEY')
    if not api_key:
        print("❌ Error: OpenRouter API key required")
        print("   Set OPENROUTER_API_KEY environment variable or use --api-key")
        sys.exit(1)

    try:
        print("🔍 Market Cap Validation with RAG Search")
        print("=========================================")

        if args.all:
            print("📁 Validating all documents in processed/ folder")
            _run_validate_all(api_key)
        elif args.file:
            _run_validate_file(args, api_key)
        else:
            # No mode selected: fall back to validating everything.
            print("📁 Validating all documents in processed/ folder (default)")
            _run_validate_all(api_key)

    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)


def _run_validate_all(api_key):
    """Validate every document in processed/ and print a per-document summary.

    Previously this logic was duplicated verbatim in both the --all branch
    and the no-arguments default branch of main().
    """
    results = validate_all_processed_documents(api_key=api_key)

    print(f"\n✅ Validation Complete!")
    print(f"📊 Processed {len(results)} documents:")

    for doc_name, doc_results in results.items():
        if 'error' in doc_results:
            print(f"  ❌ {doc_name}: {doc_results['error']}")
        else:
            summary = doc_results['summary']
            print(f"  ✅ {doc_name}: {summary['total_claims']} claims, {summary['accuracy_rate']:.1f}% accurate")
            if doc_results['report_filename']:
                print(f"     📄 Report: {doc_results['report_filename']}")


def _run_validate_file(args, api_key):
    """Validate claims from a single JSON slide file and print a summary."""
    document_name = args.document or "Unknown-Document"
    print(f"📁 Validating from file: {args.file}")

    import json
    with open(args.file, 'r', encoding='utf-8') as f:
        slide_data = json.load(f)

    results = validate_document_claims(
        document_name,
        slide_data,
        api_key=api_key,
        save_report=not args.no_save
    )

    # Display results
    summary = results['summary']
    print(f"\n✅ Validation Complete!")
    print(f"📊 Results Summary:")
    print(f"   - Total Claims Found: {summary['total_claims']}")
    print(f"   - Accurate Claims: {summary['accurate_claims']}")
    print(f"   - Inaccurate Claims: {summary['inaccurate_claims']}")
    print(f"   - Accuracy Rate: {summary['accuracy_rate']:.1f}%")

    if results['report_filename']:
        print(f"📄 Detailed report saved to: {results['report_filename']}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the CLI only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
|
||||||
|
|
@ -0,0 +1,233 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
from datetime import datetime
|
||||||
|
import os
|
||||||
|
from .rag_agent import ValidationResult, MarketCapClaim
|
||||||
|
|
||||||
|
|
||||||
|
class ValidationReportGenerator:
|
||||||
|
"""
|
||||||
|
Generates comprehensive validation reports for market cap claims
|
||||||
|
with slide source tracking
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.report_sections = []
|
||||||
|
|
||||||
|
def generate_report(self, validation_results: List[ValidationResult],
|
||||||
|
slide_texts: List[Dict[str, Any]]) -> str:
|
||||||
|
"""
|
||||||
|
Generate a comprehensive validation report
|
||||||
|
|
||||||
|
Args:
|
||||||
|
validation_results: List of ValidationResult objects
|
||||||
|
slide_texts: Original slide text data for context
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Formatted markdown report string
|
||||||
|
"""
|
||||||
|
report = []
|
||||||
|
|
||||||
|
# Header
|
||||||
|
report.append(self._generate_header())
|
||||||
|
|
||||||
|
# Executive Summary
|
||||||
|
report.append(self._generate_executive_summary(validation_results))
|
||||||
|
|
||||||
|
# Detailed Results
|
||||||
|
report.append(self._generate_detailed_results(validation_results))
|
||||||
|
|
||||||
|
# Slide Source Analysis
|
||||||
|
report.append(self._generate_slide_source_analysis(validation_results, slide_texts))
|
||||||
|
|
||||||
|
# RAG Search Details
|
||||||
|
report.append(self._generate_rag_search_details(validation_results))
|
||||||
|
|
||||||
|
# Recommendations
|
||||||
|
report.append(self._generate_recommendations(validation_results))
|
||||||
|
|
||||||
|
return '\n\n'.join(report)
|
||||||
|
|
||||||
|
def _generate_header(self) -> str:
|
||||||
|
"""Generate report header"""
|
||||||
|
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
return f"""# Market Cap Validation Report
|
||||||
|
|
||||||
|
**Generated:** {timestamp}
|
||||||
|
**Report Type:** RAG-Enhanced Validation Analysis
|
||||||
|
**Validation Method:** OpenRouter Web Search Integration
|
||||||
|
|
||||||
|
---
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _generate_executive_summary(self, results: List[ValidationResult]) -> str:
|
||||||
|
"""Generate executive summary section"""
|
||||||
|
total_claims = len(results)
|
||||||
|
accurate_claims = sum(1 for r in results if r.is_accurate)
|
||||||
|
inaccurate_claims = total_claims - accurate_claims
|
||||||
|
high_confidence = sum(1 for r in results if r.confidence_score > 0.7)
|
||||||
|
|
||||||
|
accuracy_rate = (accurate_claims / total_claims * 100) if total_claims > 0 else 0
|
||||||
|
|
||||||
|
return f"""## Executive Summary
|
||||||
|
|
||||||
|
### Key Metrics
|
||||||
|
- **Total Market Cap Claims Analyzed:** {total_claims}
|
||||||
|
- **Claims Validated as Accurate:** {accurate_claims} ({accuracy_rate:.1f}%)
|
||||||
|
- **Claims with Discrepancies:** {inaccurate_claims}
|
||||||
|
- **High Confidence Validations:** {high_confidence}
|
||||||
|
|
||||||
|
### Overall Assessment
|
||||||
|
{'✅ **GOOD** - Most claims appear accurate' if accuracy_rate > 70 else '⚠️ **CAUTION** - Significant discrepancies found' if accuracy_rate < 50 else '🔍 **MIXED** - Some claims require verification'}
|
||||||
|
|
||||||
|
---
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _generate_detailed_results(self, results: List[ValidationResult]) -> str:
|
||||||
|
"""Generate detailed validation results"""
|
||||||
|
if not results:
|
||||||
|
return "## Detailed Results\n\nNo market cap claims found in the analyzed slides.\n\n---"
|
||||||
|
|
||||||
|
report = ["## Detailed Validation Results\n"]
|
||||||
|
|
||||||
|
for i, result in enumerate(results, 1):
|
||||||
|
status_icon = "✅" if result.is_accurate else "❌" if result.discrepancy else "⚠️"
|
||||||
|
confidence_bar = self._generate_confidence_bar(result.confidence_score)
|
||||||
|
|
||||||
|
report.append(f"""### {status_icon} Claim #{i}: {result.claim.company_name}
|
||||||
|
|
||||||
|
**Slide Source:** Slide {result.claim.slide_number}
|
||||||
|
**Claimed Market Cap:** ${result.claim.claimed_market_cap}
|
||||||
|
**Raw Text:** `{result.claim.raw_text}`
|
||||||
|
**Confidence Score:** {confidence_bar} ({result.confidence_score:.2f})
|
||||||
|
|
||||||
|
**Validation Results:**
|
||||||
|
- **Validated Market Cap:** {result.validated_market_cap or 'Not found'}
|
||||||
|
- **Validation Source:** {result.validation_source}
|
||||||
|
- **Accuracy Status:** {'✅ Accurate' if result.is_accurate else '❌ Inaccurate' if result.discrepancy else '⚠️ Uncertain'}
|
||||||
|
""")
|
||||||
|
|
||||||
|
if result.discrepancy:
|
||||||
|
report.append(f"- **Discrepancy:** {result.discrepancy}")
|
||||||
|
|
||||||
|
report.append(f"- **RAG Search Query:** `{result.rag_search_query}`")
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
report.append("---")
|
||||||
|
return '\n'.join(report)
|
||||||
|
|
||||||
|
def _generate_slide_source_analysis(self, results: List[ValidationResult],
|
||||||
|
slide_texts: List[Dict[str, Any]]) -> str:
|
||||||
|
"""Generate slide source analysis section"""
|
||||||
|
report = ["## Slide Source Analysis\n"]
|
||||||
|
|
||||||
|
# Group results by slide
|
||||||
|
slide_claims = {}
|
||||||
|
for result in results:
|
||||||
|
slide_num = result.claim.slide_number
|
||||||
|
if slide_num not in slide_claims:
|
||||||
|
slide_claims[slide_num] = []
|
||||||
|
slide_claims[slide_num].append(result)
|
||||||
|
|
||||||
|
# Find slide texts
|
||||||
|
slide_text_map = {s.get('slide_number', 0): s.get('text', '') for s in slide_texts}
|
||||||
|
|
||||||
|
for slide_num in sorted(slide_claims.keys()):
|
||||||
|
claims = slide_claims[slide_num]
|
||||||
|
slide_text = slide_text_map.get(slide_num, 'No text available')
|
||||||
|
|
||||||
|
report.append(f"""### Slide {slide_num} Analysis
|
||||||
|
|
||||||
|
**Claims Found:** {len(claims)}
|
||||||
|
**Slide Text Preview:** {slide_text[:200]}{'...' if len(slide_text) > 200 else ''}
|
||||||
|
|
||||||
|
**Claims Details:**""")
|
||||||
|
|
||||||
|
for claim in claims:
|
||||||
|
status = "✅ Accurate" if any(r.claim == claim and r.is_accurate for r in results) else "❌ Inaccurate"
|
||||||
|
report.append(f"- {claim.company_name}: ${claim.claimed_market_cap} - {status}")
|
||||||
|
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
report.append("---")
|
||||||
|
return '\n'.join(report)
|
||||||
|
|
||||||
|
def _generate_rag_search_details(self, results: List[ValidationResult]) -> str:
|
||||||
|
"""Generate RAG search details section"""
|
||||||
|
report = ["## RAG Search Details\n"]
|
||||||
|
|
||||||
|
report.append("### Search Methodology")
|
||||||
|
report.append("- **Search Engine:** OpenRouter with Exa integration")
|
||||||
|
report.append("- **Model:** Mistral Small with online search enabled")
|
||||||
|
report.append("- **Search Focus:** Current market cap data (2024-2025)")
|
||||||
|
report.append("- **Validation Threshold:** 80% accuracy tolerance")
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
report.append("### Search Queries Used")
|
||||||
|
unique_queries = list(set(r.rag_search_query for r in results))
|
||||||
|
for i, query in enumerate(unique_queries, 1):
|
||||||
|
report.append(f"{i}. `{query}`")
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
report.append("### Sample RAG Responses")
|
||||||
|
for i, result in enumerate(results[:3], 1): # Show first 3 responses
|
||||||
|
report.append(f"""#### Response #{i}: {result.claim.company_name}
|
||||||
|
```
|
||||||
|
{result.rag_response[:300]}{'...' if len(result.rag_response) > 300 else ''}
|
||||||
|
```""")
|
||||||
|
|
||||||
|
report.append("---")
|
||||||
|
return '\n'.join(report)
|
||||||
|
|
||||||
|
def _generate_recommendations(self, results: List[ValidationResult]) -> str:
|
||||||
|
"""Generate recommendations section"""
|
||||||
|
inaccurate_results = [r for r in results if not r.is_accurate and r.discrepancy]
|
||||||
|
high_confidence_results = [r for r in results if r.confidence_score > 0.7]
|
||||||
|
|
||||||
|
report = ["## Recommendations\n"]
|
||||||
|
|
||||||
|
if inaccurate_results:
|
||||||
|
report.append("### ⚠️ Claims Requiring Attention")
|
||||||
|
for result in inaccurate_results:
|
||||||
|
report.append(f"- **Slide {result.claim.slide_number}:** {result.claim.company_name} - {result.discrepancy}")
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
if high_confidence_results:
|
||||||
|
report.append("### ✅ High Confidence Validations")
|
||||||
|
report.append("The following claims were validated with high confidence:")
|
||||||
|
for result in high_confidence_results:
|
||||||
|
report.append(f"- **Slide {result.claim.slide_number}:** {result.claim.company_name} - ${result.claim.claimed_market_cap}")
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
report.append("### 📋 General Recommendations")
|
||||||
|
report.append("1. **Verify Discrepancies:** Review claims marked as inaccurate with stakeholders")
|
||||||
|
report.append("2. **Update Sources:** Consider updating slide sources with more recent data")
|
||||||
|
report.append("3. **Regular Validation:** Implement periodic validation of financial claims")
|
||||||
|
report.append("4. **Source Attribution:** Always include data sources and dates in financial slides")
|
||||||
|
|
||||||
|
report.append("\n---")
|
||||||
|
report.append("*Report generated by Market Cap RAG Validation Agent*")
|
||||||
|
|
||||||
|
return '\n'.join(report)
|
||||||
|
|
||||||
|
def _generate_confidence_bar(self, confidence: float) -> str:
|
||||||
|
"""Generate a visual confidence bar"""
|
||||||
|
filled = int(confidence * 10)
|
||||||
|
empty = 10 - filled
|
||||||
|
return f"[{'█' * filled}{'░' * empty}]"
|
||||||
|
|
||||||
|
def save_report(self, report: str, filename: str = None, processed_dir: str = "processed") -> str:
|
||||||
|
"""Save report to file"""
|
||||||
|
if filename is None:
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
filename = f"market_cap_validation_report_{timestamp}.md"
|
||||||
|
|
||||||
|
# Create processed directory if it doesn't exist
|
||||||
|
os.makedirs(processed_dir, exist_ok=True)
|
||||||
|
filepath = os.path.join(processed_dir, filename)
|
||||||
|
|
||||||
|
with open(filepath, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(report)
|
||||||
|
|
||||||
|
return filepath
|
||||||
|
|
@ -0,0 +1,62 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def process_pitch_deck(pdf_path):
    """Extract, analyze, and report on a pitch deck PDF.

    Assembles the markdown report inline (bypassing the broken
    create_slide_markdown helper) and writes it under processed/.

    Args:
        pdf_path: Path to the pitch deck PDF file.

    Returns:
        Path of the generated markdown report file.
    """
    print(f"Processing: {pdf_path}")

    # Imported lazily so merely importing this module has no side effects.
    from client import get_openrouter_client
    from pdf_processor import extract_slides_from_pdf
    from analysis import analyze_slides_batch

    stem = Path(pdf_path).stem

    # Extract slides
    slides = extract_slides_from_pdf(pdf_path, "processed", stem)
    print(f"Extracted {len(slides)} slides")

    # Analyze slides
    client = get_openrouter_client()
    analysis_results = analyze_slides_batch(client, slides)
    print("Analysis complete")

    # Build the report as a list of parts and join once, instead of
    # quadratic string concatenation in a loop.
    parts = [f"# Pitch Deck Analysis: {stem}\n\n"]

    for slide_num, _ in enumerate(slides, 1):
        analysis = analysis_results.get(slide_num, {})

        parts.append(f"## Slide {slide_num}\n\n")
        parts.append("\n\n")  # plain string; was an f-string with no placeholders

        if analysis:
            parts.append(f"**Analysis:**\n{analysis}\n\n")
        else:
            parts.append("**Analysis:** No analysis available\n\n")

        parts.append("---\n\n")

    markdown_content = "".join(parts)

    # Save report
    output_file = f"processed/{stem}_analysis.md"
    os.makedirs("processed", exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(markdown_content)

    print(f"Report saved to: {output_file}")
    return output_file
|
||||||
|
|
||||||
|
# Script entry point: expects exactly one argument, the PDF to process.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python working_app.py <pdf_path>")
        sys.exit(1)

    pdf_path = sys.argv[1]
    # Fail fast on a missing file before any heavyweight processing starts.
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found")
        sys.exit(1)

    process_pitch_deck(pdf_path)
|
||||||
|
After Width: | Height: | Size: 60 KiB |
|
After Width: | Height: | Size: 94 KiB |
|
After Width: | Height: | Size: 86 KiB |
|
After Width: | Height: | Size: 101 KiB |
|
After Width: | Height: | Size: 110 KiB |
|
After Width: | Height: | Size: 32 KiB |
|
After Width: | Height: | Size: 1.8 MiB |
|
After Width: | Height: | Size: 2.3 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 91 KiB |
|
After Width: | Height: | Size: 93 KiB |
|
After Width: | Height: | Size: 32 KiB |
|
After Width: | Height: | Size: 62 KiB |
|
After Width: | Height: | Size: 126 KiB |
|
After Width: | Height: | Size: 327 KiB |
|
After Width: | Height: | Size: 93 KiB |
|
After Width: | Height: | Size: 105 KiB |
|
After Width: | Height: | Size: 100 KiB |
|
|
@ -0,0 +1,6 @@
|
||||||
|
pdf2image
|
||||||
|
openai
|
||||||
|
requests
|
||||||
|
PyMuPDF
|
||||||
|
docling
|
||||||
|
python-dotenv
|
||||||
|
|
@ -0,0 +1,58 @@
|
||||||
|
#!/bin/bash
# start.sh — set up a Python virtualenv and run the pitch deck parser on a file.
#
# Usage: ./start.sh <file_path>

# Handle --help/-h and argument validation FIRST, before any expensive setup.
# (Previously these checks ran only after killing port 3123, creating the
# venv, and installing all dependencies — so `./start.sh --help` performed a
# full, destructive environment setup just to print usage.)
if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
    echo ""
    echo "Pitch Deck Analysis Application"
    echo "=============================="
    echo "Usage: ./start.sh <file_path>"
    echo "Example: ./start.sh presentation.pdf"
    echo ""
    echo "The application will automatically upload the generated report."
    echo ""
    exit 0
fi

# Verify an input file was given and exists
if [ -z "$1" ]; then
    echo "Error: No file specified"
    echo "Usage: ./start.sh <file_path>"
    exit 1
fi

if [ ! -f "$1" ]; then
    echo "Error: File '$1' not found"
    exit 1
fi

# Kill any process running on port 3123
echo "Killing any existing processes on port 3123..."
fuser -k 3123/tcp 2>/dev/null || true

# Create virtual environment if it doesn't exist
if [ ! -d "venv" ]; then
    echo "Creating virtual environment..."
    python3 -m venv venv
fi

# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate

# Verify virtual environment is active
echo "Verifying virtual environment..."
which python3
python3 --version

# Install dependencies
echo "Installing dependencies..."
pip install -r requirements.txt

# Start the application with immediate feedback
echo "Starting pitch deck parser..."
echo "Processing file: $1"
echo "Python path: $(which python3)"
echo "Working directory: $(pwd)"
echo "----------------------------------------"

python3 app.py "$1"
|
||||||