Initial commit: Technical screen project with document analysis capabilities

This commit is contained in:
Colin 2025-10-22 18:55:39 -04:00
commit 0bb86c677d
40 changed files with 5857 additions and 0 deletions

View File

@ -0,0 +1,5 @@
---
alwaysApply: true
---
# Code Cleanup Guidelines
Remove unused code, imports, and dead functions to keep the codebase clean and maintainable. Regular cleanup prevents technical debt and improves code readability.

View File

@ -0,0 +1,5 @@
---
alwaysApply: true
---
# Code Length Guidelines
Keep all code files under 300 lines for better maintainability and readability. If a file exceeds this limit, consider breaking it into smaller, focused modules.

55
.gitignore vendored Normal file
View File

@ -0,0 +1,55 @@
# Environment variables
.env
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# Virtual environments
venv/
env/
ENV/
env.bak/
venv.bak/
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# Logs
*.log
# Temporary files
*.tmp
*.temp

152
app.py Normal file
View File

@ -0,0 +1,152 @@
#!/usr/bin/env python3
import sys
import os
import re
from pathlib import Path
def generate_toc(markdown_content):
    """Build a markdown Table of Contents from the document's headers.

    Scans for headers of level 2 and deeper (##, ###, ...), derives a
    GitHub-style anchor for each, and returns a TOC section terminated by a
    horizontal rule.
    """
    print(" 📋 Generating Table of Contents...")
    header_pattern = re.compile(r'^(#{2,})\s+(.+)$')
    toc = ["## Table of Contents", ""]
    total_headers = 0
    for raw_line in markdown_content.split('\n'):
        match = header_pattern.match(raw_line)
        if not match:
            continue
        total_headers += 1
        hashes, title = match.groups()
        depth = len(hashes) - 2  # ## -> depth 0, ### -> depth 1, ...
        # GitHub-style anchor: lowercase, drop punctuation, spaces -> dashes.
        anchor = re.sub(r'[^a-zA-Z0-9\s-]', '', title.lower())
        anchor = re.sub(r'\s+', '-', anchor.strip())
        toc.append(f"{'  ' * depth}- [{title}](#{anchor})")
    toc.extend(["", "---", ""])
    print(f" ✅ Generated TOC with {total_headers} headers")
    return '\n'.join(toc)
def main():
    """Pitch deck analyzer entry point.

    Usage: python app.py <pdf_file>

    Extracts slide images from the PDF, runs multi-agent analysis on each
    slide, writes a markdown report under processed/, and uploads it.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py <pdf_file>")
        return
    pdf_path = sys.argv[1]
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found")
        return
    print(f"🚀 Processing: {pdf_path}")
    # Import what we need directly (avoid __init__.py issues)
    print("📦 Importing modules...")
    sys.path.append('modules')
    from client import get_openrouter_client
    from pdf_processor import extract_slides_from_pdf
    from analysis import analyze_slides_batch
    from markdown_utils import send_to_api_and_get_haste_link
    print("✅ Modules imported successfully")
    # Extract slides
    print("📄 Extracting slides...")
    slides = extract_slides_from_pdf(pdf_path, "processed", Path(pdf_path).stem)
    print(f"✅ Extracted {len(slides)} slides")
    # Analyze slides
    print("🧠 Analyzing slides...")
    client = get_openrouter_client()
    print("🔗 API client initialized")
    analysis_results = analyze_slides_batch(client, slides)
    print("✅ Analysis complete")
    # Create report
    print("📝 Creating report...")
    markdown_content = f"# Pitch Deck Analysis: {Path(pdf_path).stem}\n\n"
    # Add analysis metadata
    markdown_content += "This analysis was generated using multiple AI agents, each specialized in different aspects of slide evaluation.\n\n"
    markdown_content += f"**Source File:** `{Path(pdf_path).name}` (PDF)\n"
    markdown_content += f"**Analysis Generated:** {len(slides)} slides processed\n"
    markdown_content += "**Processing Method:** Individual processing with specialized AI agents\n"
    markdown_content += "**Text Extraction:** Docling-powered text transcription\n\n"
    print(f"📊 Building markdown for {len(slides)} slides...")
    for i, slide_data in enumerate(slides):
        slide_num = i + 1
        analysis = analysis_results.get(slide_num, {})
        print(f" 📄 Processing slide {slide_num}...")
        markdown_content += f"# Slide {slide_num}\n\n"
        markdown_content += f"![Slide {slide_num}](slides/{slide_data['filename']})\n\n"
        if analysis:
            markdown_content += "## Agentic Analysis\n\n"
            # One subsection per agent; skip malformed entries.
            agent_count = 0
            for agent_key, agent_data in analysis.items():
                if isinstance(agent_data, dict) and 'agent' in agent_data and 'analysis' in agent_data:
                    agent_count += 1
                    markdown_content += f"### {agent_data['agent']}\n\n"
                    markdown_content += f"{agent_data['analysis']}\n\n"
            print(f" ✅ Added {agent_count} agent analyses")
        else:
            markdown_content += "## Agentic Analysis\n\n"
            markdown_content += "No analysis available\n\n"
            print(f" ⚠️ No analysis available for slide {slide_num}")
        markdown_content += "---\n\n"
    # Generate Table of Contents
    print("📋 Generating Table of Contents...")
    toc = generate_toc(markdown_content)
    # Insert TOC after the main title
    print("🔗 Inserting TOC into document...")
    lines = markdown_content.split('\n')
    final_content = [lines[0], "", toc]   # main title, blank line, TOC
    final_content.extend(lines[2:])       # rest of content
    final_markdown = '\n'.join(final_content)
    # Save report
    output_file = f"processed/{Path(pdf_path).stem}_analysis.md"
    print(f"💾 Saving report to: {output_file}")
    os.makedirs("processed", exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_markdown)
    print(f"✅ Report saved successfully ({len(final_markdown)} characters)")
    # Always upload the report
    print("🌐 Uploading report...")
    # BUG FIX: send_to_api_and_get_haste_link returns a (raw_url, html_url)
    # tuple; the old truthiness check on the tuple itself was always True,
    # so failures were reported as successes and the tuple was printed as
    # the URL. Unpack and check the individual URLs instead.
    raw_haste_url, html_url = send_to_api_and_get_haste_link(final_markdown, Path(pdf_path).stem)
    if raw_haste_url or html_url:
        print(f"✅ Report uploaded to: {raw_haste_url or html_url}")
    else:
        print("❌ Upload failed")
# Script entry point: run the analyzer only when executed directly.
if __name__ == "__main__":
    main()

5
example.env Normal file
View File

@ -0,0 +1,5 @@
# OpenRouter API Configuration
OPENROUTER_API_KEY=your_openrouter_api_key_here
# Optional: custom OpenRouter model identifier (e.g. openai/gpt-3.5-turbo)
# OPENROUTER_MODEL=openai/gpt-3.5-turbo

85
modules/__init__.py Normal file
View File

@ -0,0 +1,85 @@
#!/usr/bin/env python3
# Pitch Deck Parser Modules
# This package contains all the modular components for the pitch deck analysis application.
# Importing this package eagerly pulls in every submodule below.
from .client import get_openrouter_client
from .file_utils import detect_file_type, convert_to_pdf, convert_with_libreoffice
from .pdf_processor import extract_slides_from_pdf
from .docling_processor import extract_text_with_docling, get_slide_text_content
# NOTE(review): confirm analysis.py actually defines all four of these names;
# otherwise importing the package raises ImportError.
from .analysis import (
    analyze_slide_with_single_prompt,
    analyze_slides_batch,
    analyze_slide_with_agentic_prompts_parallel,
    process_single_slide_parallel
)
from .markdown_utils import (
    create_slide_markdown,
    create_text_only_markdown,
    send_to_api_and_get_haste_link
)
# Explicit public API of the package (extended further below).
__all__ = [
    'get_openrouter_client',
    'detect_file_type',
    'convert_to_pdf',
    'convert_with_libreoffice',
    'extract_slides_from_pdf',
    'extract_text_with_docling',
    'get_slide_text_content',
    'analyze_slide_with_single_prompt',
    'analyze_slides_batch',
    'analyze_slide_with_agentic_prompts_parallel',
    'process_single_slide_parallel',
    'create_slide_markdown',
    'create_text_only_markdown',
    'send_to_api_and_get_haste_link'
]
# Market Cap RAG Validation
from .rag_agent import MarketCapRAGAgent, MarketCapClaim, ValidationResult
from .validation_report import ValidationReportGenerator
from .market_cap_validator import (
    MarketCapValidator,
    validate_market_caps,
    validate_market_caps_from_file,
    validate_market_caps_from_processed
)
# Update __all__ list
__all__.extend([
    'MarketCapRAGAgent',
    'MarketCapClaim',
    'ValidationResult',
    'ValidationReportGenerator',
    'MarketCapValidator',
    'validate_market_caps',
    'validate_market_caps_from_file',
    'validate_market_caps_from_processed'
])
# Document-specific validation
from .document_validator import (
    DocumentValidator,
    validate_document_claims,
    validate_all_processed_documents
)
# Update __all__ list
__all__.extend([
    'DocumentValidator',
    'validate_document_claims',
    'validate_all_processed_documents'
])
# Main application and CLI tools
# NOTE(review): these star-imports run the CLI modules at package import time;
# confirm modules/app.py, modules/example_usage.py and
# modules/validate_market_caps.py exist inside this package (the analyzer's
# app.py appears to live at the repository root) — otherwise importing the
# package fails with ImportError.
from .app import *
from .example_usage import *
from .validate_market_caps import *
# Update __all__ list
# NOTE(review): these entries are module-name strings, not names exported by
# the star-imports above; verify this is intentional.
__all__.extend([
    'app',
    'example_usage',
    'validate_market_caps'
])

90
modules/analysis.py Normal file
View File

@ -0,0 +1,90 @@
import re
from client import get_openrouter_client
def analyze_slides_batch(client, slides_data, batch_size=1):
    """Analyze each slide with a panel of five specialized AI agents.

    Args:
        client: OpenAI-compatible chat-completions client (e.g. the
            OpenRouter client returned by get_openrouter_client()).
        slides_data: list of dicts, each with a 'page_num' and a 'base64'
            PNG payload for the slide image.
        batch_size: unused; retained for interface compatibility.

    Returns:
        Dict mapping slide number -> {agent_key: {'agent': display name,
        'analysis': analysis text or error message}}. API failures are
        recorded per agent instead of aborting the whole run.
    """
    print(f" Processing {len(slides_data)} slides individually...")
    # The agent roster is loop-invariant; build it once rather than
    # rebuilding the dict on every slide iteration as before.
    agents = {
        'content_extractor': {
            'name': 'Content Extractor',
            'prompt': 'Extract and summarize the key textual content from this slide. Focus on headlines, bullet points, and main messages.'
        },
        'visual_analyzer': {
            'name': 'Visual Analyzer',
            'prompt': 'Analyze the visual design elements of this slide. Comment on layout, colors, typography, and visual hierarchy.'
        },
        'data_interpreter': {
            'name': 'Data Interpreter',
            'prompt': 'Identify and interpret any numerical data, charts, graphs, or metrics present on this slide.'
        },
        'message_evaluator': {
            'name': 'Message Evaluator',
            'prompt': 'Evaluate the effectiveness of the message delivery and communication strategy on this slide.'
        },
        'improvement_suggestor': {
            'name': 'Improvement Suggestor',
            'prompt': 'Suggest specific improvements for this slide in terms of clarity, impact, and effectiveness.'
        }
    }
    all_results = {}
    for i, slide_data in enumerate(slides_data):
        slide_num = slide_data["page_num"]
        print(f" 🔍 Analyzing slide {slide_num} ({i+1}/{len(slides_data)})...")
        slide_analysis = {}
        # Analyze with each specialized agent
        for j, (agent_key, agent_config) in enumerate(agents.items()):
            print(f" 🤖 Running {agent_config['name']} ({j+1}/{len(agents)})...")
            # Vision-style message: system prompt for the agent persona plus
            # the slide image as a base64 data URL.
            messages = [
                {
                    "role": "system",
                    "content": f"You are a {agent_config['name']} specialized in analyzing pitch deck slides. {agent_config['prompt']}"
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"Analyze slide {slide_num}:"},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{slide_data['base64']}"
                            }
                        }
                    ]
                }
            ]
            try:
                print(" 📡 Sending API request...")
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages,
                    max_tokens=500
                )
                analysis = response.choices[0].message.content.strip()
                print(f"{agent_config['name']} completed ({len(analysis)} chars)")
                slide_analysis[agent_key] = {
                    'agent': agent_config['name'],
                    'analysis': analysis
                }
            except Exception as e:
                # Record the failure so the report still renders this agent.
                print(f"{agent_config['name']} failed: {str(e)}")
                slide_analysis[agent_key] = {
                    'agent': agent_config['name'],
                    'analysis': f"Error analyzing slide {slide_num}: {str(e)}"
                }
        all_results[slide_num] = slide_analysis
        print(f" ✅ Slide {slide_num} analysis complete")
    print(f" 🎉 All {len(slides_data)} slides analyzed successfully!")
    return all_results

23
modules/client.py Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env python3
import os
import sys
from openai import OpenAI
from dotenv import load_dotenv
def get_openrouter_client():
    """Initialize OpenRouter client with API key from .env file"""
    # Pull OPENROUTER_API_KEY into the environment from a local .env file.
    load_dotenv()
    key = os.getenv('OPENROUTER_API_KEY')
    # Reject a missing, empty, or still-placeholder key before making calls.
    if key in (None, '', 'your_openrouter_api_key_here'):
        print("❌ Error: OPENROUTER_API_KEY not properly set in .env file")
        print("Please update your .env file with a valid OpenRouter API key")
        sys.exit(1)
    return OpenAI(base_url="https://openrouter.ai/api/v1", api_key=key)

View File

@ -0,0 +1,172 @@
#!/usr/bin/env python3
from docling.document_converter import DocumentConverter
from pathlib import Path
import fitz # PyMuPDF as fallback
import re
def clean_text(text):
    """Normalize extracted text to plaintext: strip LaTeX, odd symbols, and extra whitespace."""
    if not text:
        return ""
    # Drop LaTeX constructs in order: \cmd{...}, $...$ math, bare \cmd tokens.
    for latex_pattern in (r'\\[a-zA-Z]+\{[^}]*\}', r'\$[^$]*\$', r'\\[a-zA-Z]+'):
        text = re.sub(latex_pattern, '', text)
    # Replace anything outside the allowed character set with a space.
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' ', text)
    # Collapse whitespace runs.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()
def extract_text_with_docling(pdf_path, output_dir, document_name):
    """Extract text content from PDF using Docling with PyMuPDF fallback.

    Args:
        pdf_path: path to the source PDF.
        output_dir: not used by the body — output always goes under
            processed/<document_name>/.  NOTE(review): confirm callers
            expect this parameter to be ignored.
        document_name: directory name and file stem for the saved markdown.

    Returns:
        Dict with 'text_content', 'text_file', and 'processed_dir' keys,
        or None if both Docling and the PyMuPDF fallback fail.
    """
    print(f"Extracting text content with Docling: {pdf_path}")
    try:
        # Initialize Docling converter
        converter = DocumentConverter()
        # Configure OCR for better text extraction
        # NOTE(review): confirm this DocumentConverter version exposes an
        # `ocr_options` attribute; some docling releases configure OCR via
        # pipeline options passed to the constructor instead, which would
        # make these three lines raise AttributeError and trip the fallback.
        converter.ocr_options.engine = "rapidocr"  # Use faster OCR engine
        converter.ocr_options.do_ocr = True
        converter.ocr_options.do_table_ocr = True
        # Convert PDF to text
        result = converter.convert(pdf_path)
        # Get the text content as markdown, then reduce it to plaintext.
        text_content = result.document.export_to_markdown()
        text_content = clean_text(text_content)
        # Create processed directory structure if it doesn't exist
        processed_dir = Path("processed") / document_name
        processed_dir.mkdir(parents=True, exist_ok=True)
        # Save the text content to a file
        text_file = processed_dir / f"{document_name}_text_content.md"
        with open(text_file, 'w', encoding='utf-8') as f:
            f.write(text_content)
        print(f"✅ Text content extracted and saved to: {text_file}")
        return {
            'text_content': text_content,
            'text_file': text_file,
            'processed_dir': processed_dir
        }
    except Exception as e:
        print(f"❌ Docling failed: {e}")
        print("🔄 Trying PyMuPDF fallback...")
        # Fallback to PyMuPDF; mirrors the success path above.
        try:
            text_content = extract_text_with_pymupdf(pdf_path)
            if text_content:
                # Clean the text to ensure it's plaintext
                text_content = clean_text(text_content)
                # Create processed directory structure if it doesn't exist
                processed_dir = Path("processed") / document_name
                processed_dir.mkdir(parents=True, exist_ok=True)
                # Save the text content to a file
                text_file = processed_dir / f"{document_name}_text_content.md"
                with open(text_file, 'w', encoding='utf-8') as f:
                    f.write(text_content)
                print(f"✅ Text content extracted with PyMuPDF fallback: {text_file}")
                return {
                    'text_content': text_content,
                    'text_file': text_file,
                    'processed_dir': processed_dir
                }
            else:
                print("⚠️ PyMuPDF fallback also failed")
                return None
        except Exception as fallback_error:
            print(f"❌ PyMuPDF fallback also failed: {fallback_error}")
            return None
def extract_text_with_pymupdf(pdf_path):
    """Extract text using PyMuPDF as fallback with clean formatting"""
    try:
        doc = fitz.open(pdf_path)
        parts = []
        for page_index in range(len(doc)):
            # Clean each page's raw text and tag it with a page separator
            # that get_slide_text_content() later splits on.
            cleaned = clean_text(doc[page_index].get_text())
            parts.append(f"\n--- Page {page_index + 1} ---\n")
            parts.append(cleaned)
            parts.append("\n")
        doc.close()
        return "".join(parts)
    except Exception as e:
        print(f"PyMuPDF extraction failed: {e}")
        return None
def get_slide_text_content(text_content, slide_num):
    """Return the cleaned text for one slide from the full document text.

    The full text is expected to contain "--- Page N ---" separators as
    written by extract_text_with_pymupdf(); when no matching page is found,
    the function falls back to paragraph- or line-based slicing.

    Args:
        text_content: full extracted document text (may be empty/None).
        slide_num: 1-based slide/page number.

    Returns:
        Cleaned plaintext for the slide ("" for empty input), or a
        placeholder string if extraction raises.
    """
    try:
        if not text_content:
            return ""
        # Split by page separators; each piece after the first begins with
        # " {page_num} ---" (note the leading space from the separator).
        pages = text_content.split('--- Page')
        target_page = None
        for page in pages:
            # BUG FIX: the original compared the *stripped* piece against a
            # prefix that still began with a space (" {n} ---"), which could
            # never match — so the page lookup always fell through to the
            # fallback heuristics below. Strip first, then match without
            # the leading space.
            if page.strip().startswith(f"{slide_num} ---"):
                target_page = page
                break
        if target_page:
            # Remove the page header line and clean up the remainder.
            lines = target_page.split('\n')[1:]
            slide_text = '\n'.join(lines).strip()
            return clean_text(slide_text)
        # Fallback 1: treat blank-line-separated sections as slides.
        sections = text_content.split('\n\n')
        if slide_num <= len(sections):
            return clean_text(sections[slide_num - 1] if slide_num > 0 else sections[0])
        # Fallback 2: return an approximate window of lines for this slide.
        lines = text_content.split('\n')
        start_line = (slide_num - 1) * 5  # Approximate 5 lines per slide
        end_line = min(start_line + 10, len(lines))  # Up to 10 lines
        return clean_text('\n'.join(lines[start_line:end_line]))
    except Exception as e:
        print(f"⚠️ Error extracting text for slide {slide_num}: {e}")
        return f"[Text content for slide {slide_num} could not be extracted]"

View File

@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Document-specific validator that organizes reports by document in processed directory
"""
import os
import json
from typing import List, Dict, Any, Optional
from .rag_agent import MarketCapRAGAgent
from .validation_report import ValidationReportGenerator
class DocumentValidator:
    """
    Validates financial claims for specific documents with proper directory organization
    """
    def __init__(self, api_key: Optional[str] = None):
        # The RAG agent extracts and fact-checks claims; the report
        # generator renders the results as a markdown report.
        self.rag_agent = MarketCapRAGAgent(api_key)
        self.report_generator = ValidationReportGenerator()
    def validate_document(self, document_name: str, slide_texts: List[Dict[str, Any]],
                          save_report: bool = True) -> Dict[str, Any]:
        """
        Validate financial claims for a specific document.
        Args:
            document_name: Name of the document (e.g., "Uber-Pitch-Deck")
            slide_texts: List of slide data with 'slide_number' and 'text' keys
            save_report: Whether to save the validation report to file
        Returns:
            Dictionary containing validation results, report text, report
            filename (None if not saved), and a summary dict.
        """
        print(f"🔍 Validating financial claims for: {document_name}")
        # Extract and validate claims
        validation_results = self.rag_agent.validate_all_claims(slide_texts)
        # Generate report
        report = self.report_generator.generate_report(validation_results, slide_texts)
        # Save report under processed/<document_name>/ so each document's
        # artifacts stay together.
        report_filename = None
        if save_report:
            # Create document-specific directory
            doc_dir = os.path.join("processed", document_name)
            os.makedirs(doc_dir, exist_ok=True)
            # Save report in document directory
            report_filename = self.report_generator.save_report(
                report,
                f"{document_name}_market_cap_validation.md",
                doc_dir
            )
            print(f"📄 Validation report saved to: {report_filename}")
        # Prepare summary
        summary = self._generate_summary(validation_results)
        return {
            'document_name': document_name,
            'validation_results': validation_results,
            'report': report,
            'report_filename': report_filename,
            'summary': summary
        }
    def validate_from_processed_folder(self, folder_path: str = "processed") -> Dict[str, Any]:
        """
        Validate all documents in the processed folder.
        Args:
            folder_path: Path to processed folder
        Returns:
            Dictionary mapping document name -> validation results, or
            {'error': message} for documents that failed.
        Raises:
            ValueError: if the processed folder does not exist.
        """
        results = {}
        if not os.path.exists(folder_path):
            raise ValueError(f"Processed folder not found: {folder_path}")
        # Find all document directories (skip hidden entries).
        for item in os.listdir(folder_path):
            item_path = os.path.join(folder_path, item)
            if os.path.isdir(item_path) and not item.startswith('.'):
                # Look for text content files written by the extractors.
                text_files = [f for f in os.listdir(item_path) if f.endswith('_text_content.md')]
                if text_files:
                    document_name = item
                    text_file = os.path.join(item_path, text_files[0])
                    print(f"📁 Processing document: {document_name}")
                    # Read text content
                    with open(text_file, 'r', encoding='utf-8') as f:
                        content = f.read()
                    # Convert to slide format.
                    # NOTE(review): the entire document is treated as a single
                    # "slide", so per-slide claim attribution is lost here.
                    slide_texts = [{
                        "slide_number": 1,
                        "text": content
                    }]
                    # Validate document; one failure must not abort the rest.
                    try:
                        doc_results = self.validate_document(document_name, slide_texts)
                        results[document_name] = doc_results
                    except Exception as e:
                        print(f"❌ Error processing {document_name}: {e}")
                        results[document_name] = {'error': str(e)}
        return results
    def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
        """Generate a summary of validation results (counts and accuracy rate)."""
        total_claims = len(validation_results)
        accurate_claims = sum(1 for r in validation_results if r.is_accurate)
        inaccurate_claims = total_claims - accurate_claims
        return {
            'total_claims': total_claims,
            'accurate_claims': accurate_claims,
            'inaccurate_claims': inaccurate_claims,
            # Percentage; 0 when there were no claims (avoids ZeroDivisionError).
            'accuracy_rate': (accurate_claims / total_claims * 100) if total_claims > 0 else 0,
            'claims_by_slide': self._group_claims_by_slide(validation_results)
        }
    def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
        """Group claims by slide number."""
        claims_by_slide = {}
        for result in validation_results:
            slide_num = result.claim.slide_number
            if slide_num not in claims_by_slide:
                claims_by_slide[slide_num] = []
            claims_by_slide[slide_num].append(result)
        return claims_by_slide
def validate_document_claims(document_name: str, slide_texts: List[Dict[str, Any]],
                             api_key: Optional[str] = None,
                             save_report: bool = True) -> Dict[str, Any]:
    """Validate financial claims for one document.

    Thin wrapper: builds a DocumentValidator and delegates.

    Args:
        document_name: Name of the document
        slide_texts: List of slide data
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file
    Returns:
        Dictionary containing validation results and report
    """
    return DocumentValidator(api_key).validate_document(document_name, slide_texts, save_report)
def validate_all_processed_documents(folder_path: str = "processed",
                                     api_key: Optional[str] = None) -> Dict[str, Any]:
    """Validate every document found in the processed folder.

    Thin wrapper: builds a DocumentValidator and delegates.

    Args:
        folder_path: Path to processed folder
        api_key: OpenRouter API key (optional)
    Returns:
        Dictionary with results for each document
    """
    return DocumentValidator(api_key).validate_from_processed_folder(folder_path)
# CLI entry point: validate every document under processed/ and print a
# per-document summary line (claim count and accuracy rate).
if __name__ == "__main__":
    # Example usage
    print("Document Validator - RAG Agent")
    print("===============================")
    try:
        results = validate_all_processed_documents()
        print(f"\n✅ Validation Complete!")
        print(f"📊 Processed {len(results)} documents:")
        for doc_name, doc_results in results.items():
            # Failed documents carry an 'error' key instead of results.
            if 'error' in doc_results:
                print(f"{doc_name}: {doc_results['error']}")
            else:
                summary = doc_results['summary']
                print(f"{doc_name}: {summary['total_claims']} claims, {summary['accuracy_rate']:.1f}% accurate")
                if doc_results['report_filename']:
                    print(f" 📄 Report: {doc_results['report_filename']}")
    except Exception as e:
        print(f"❌ Error: {e}")

111
modules/file_utils.py Normal file
View File

@ -0,0 +1,111 @@
#!/usr/bin/env python3
import subprocess
from pathlib import Path
def detect_file_type(file_path):
    """Map a file's extension to a coarse document type label.

    Returns one of 'pdf', 'powerpoint', 'word', 'openoffice_presentation',
    'openoffice_document', or 'unknown' for anything unrecognized.
    """
    extension_map = {
        '.pdf': 'pdf',
        '.pptx': 'powerpoint',
        '.ppt': 'powerpoint',
        '.docx': 'word',
        '.doc': 'word',
        '.odp': 'openoffice_presentation',
        '.odt': 'openoffice_document',
    }
    suffix = Path(file_path).suffix.lower()
    return extension_map.get(suffix, 'unknown')
def convert_to_pdf(input_file, output_dir, document_name):
    """Convert various file types to PDF.

    Args:
        input_file: source document path (any type detect_file_type knows).
        output_dir: directory where the temporary PDF should be written.
        document_name: stem used for the temporary PDF's filename.

    Returns:
        Path string to a PDF (the input itself if already a PDF), or None
        if the conversion failed, timed out, or the type is unsupported.
    """
    file_type = detect_file_type(input_file)
    if file_type == 'pdf':
        print("✅ File is already PDF, no conversion needed")
        return input_file
    print(f"🔄 Converting {file_type} file to PDF...")
    # BUG FIX: this was built with string concatenation, but the code below
    # calls .exists() on it, which only works on a pathlib.Path.
    temp_pdf = Path(output_dir) / f"{document_name}_temp.pdf"
    try:
        if file_type == 'powerpoint':
            # Convert PowerPoint to PDF using pptxtopdf
            print(" Using pptxtopdf for PowerPoint conversion...")
            result = subprocess.run([
                'python', '-c',
                f'import pptxtopdf; pptxtopdf.convert("{input_file}", "{temp_pdf}")'
            ], capture_output=True, text=True, timeout=60)
            if result.returncode != 0:
                print(f"⚠️ pptxtopdf failed: {result.stderr}")
                # Fallback: try using LibreOffice
                return convert_with_libreoffice(input_file, temp_pdf, file_type)
        elif file_type in ('word', 'openoffice_document', 'openoffice_presentation'):
            # All LibreOffice-handled formats share the same conversion path.
            return convert_with_libreoffice(input_file, temp_pdf, file_type)
        else:
            print(f"❌ Unsupported file type: {file_type}")
            return None
        if temp_pdf.exists():
            print(f"✅ Successfully converted to PDF: {temp_pdf}")
            return str(temp_pdf)
        print("❌ Conversion failed - PDF file not created")
        return None
    except subprocess.TimeoutExpired:
        print("❌ Conversion timed out")
        return None
    except Exception as e:
        print(f"❌ Conversion error: {e}")
        return None
def convert_with_libreoffice(input_file, output_pdf, file_type):
    """Convert files using LibreOffice as fallback.

    Args:
        input_file: source document path.
        output_pdf: desired output PDF path (str or Path).
        file_type: label used only in log messages.

    Returns:
        Path string of the converted PDF, or None on any failure.
    """
    try:
        print(f" Using LibreOffice for {file_type} conversion...")
        # BUG FIX: callers may pass a plain string, but .parent/.exists()/
        # .rename() below require a pathlib.Path.
        output_pdf = Path(output_pdf)
        # LibreOffice command
        cmd = [
            'soffice', '--headless', '--convert-to', 'pdf',
            '--outdir', str(output_pdf.parent),
            str(input_file)
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        if result.returncode == 0:
            # LibreOffice names the output PDF after the input file.
            # BUG FIX: this path was built with os.path.dirname ('os' was
            # never imported in this module -> NameError) and the resulting
            # str then had .exists()/.rename() called on it.
            libreoffice_pdf = output_pdf.parent / f"{Path(input_file).stem}.pdf"
            if libreoffice_pdf.exists():
                # Rename to our expected temp name
                libreoffice_pdf.rename(output_pdf)
                print(f"✅ LibreOffice conversion successful: {output_pdf}")
                return str(output_pdf)
        print(f"⚠️ LibreOffice conversion failed: {result.stderr}")
        return None
    except subprocess.TimeoutExpired:
        print("❌ LibreOffice conversion timed out")
        return None
    except Exception as e:
        print(f"❌ LibreOffice conversion error: {e}")
        return None

173
modules/markdown_utils.py Normal file
View File

@ -0,0 +1,173 @@
#!/usr/bin/env python3
import re
import requests
import json
def clean_markdown_text(text):
    """Reduce markdown to plaintext: strip LaTeX, markdown syntax, odd characters, and extra whitespace."""
    if not text:
        return ""
    # Ordered substitution pipeline; order matters (e.g. headers must be
    # removed before the allowed-character pass, which permits '#').
    substitutions = (
        (r'\\[a-zA-Z]+\{[^}]*\}', ''),    # \command{content}
        (r'\$[^$]*\$', ''),               # $math$ expressions
        (r'\\[a-zA-Z]+', ''),             # remaining \commands
        (r'\*\*([^*]+)\*\*', r'\1'),      # bold **text**
        (r'\*([^*]+)\*', r'\1'),          # italic *text*
        (r'`([^`]+)`', r'\1'),            # inline code `text`
        (r'#{1,6}\s*', ''),               # headers # ## ###
        (r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' '),
        (r'\s+', ' '),                    # collapse whitespace
        (r'\n\s*\n', '\n\n'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def create_slide_markdown(slide_data, analysis_results, slide_num, slide_text=""):
    """Create markdown content for a single slide with all agentic analyses and text content.

    Args:
        slide_data: dict with at least a 'filename' key for the slide image.
        analysis_results: mapping of agent key -> {'agent': name, 'analysis': text}.
        slide_num: 1-based slide number used in headings.
        slide_text: optional extracted text for the slide; skipped if blank.

    Returns:
        Markdown string for the slide, terminated by a horizontal rule.
    """
    # Slide heading plus the embedded slide image.
    markdown = f"""# Slide {slide_num}
![Slide {slide_num}](slides/{slide_data['filename']})
"""
    # Add text content if available
    if slide_text and slide_text.strip():
        # Clean the slide text to ensure it's plaintext
        clean_slide_text = clean_markdown_text(slide_text)
        markdown += f"""## Text Content
{clean_slide_text}
"""
    markdown += """## Agentic Analysis
"""
    # One subsection per agent, with the analysis reduced to plaintext.
    for prompt_key, result in analysis_results.items():
        # Clean the analysis text to ensure it's plaintext
        clean_analysis = clean_markdown_text(result['analysis'])
        markdown += f"""### {result['agent']}
{clean_analysis}
"""
    markdown += "---\n\n"
    return markdown
def create_text_only_markdown(markdown_content):
    """Create a text-only version of markdown without image references for API submission"""
    # Strip image embeds while keeping the surrounding analysis text.
    stripped = re.sub(r'!\[.*?\]\(slides/.*?\)\n', '', markdown_content)
    # Strip "view full size" image links.
    stripped = re.sub(r'\*\[View full size: slides/.*?\]\(slides/.*?\)\*\n', '', stripped)
    # Drop the horizontal rules used as slide separators.
    stripped = re.sub(r'^---\n', '', stripped, flags=re.MULTILINE)
    # Collapse runs of blank lines.
    stripped = re.sub(r'\n{3,}', '\n\n', stripped)
    # Final plaintext normalization pass.
    return clean_markdown_text(stripped).strip()
def send_to_api_and_get_haste_link(markdown_content, document_title):
    """Send markdown to API and get both raw markdown and HTML URLs.

    Args:
        markdown_content: full report markdown (image references are
            stripped before upload).
        document_title: used in the HTML page's title.

    Returns:
        Tuple (raw_haste_url, html_url); either element may be None if that
        particular upload failed, and (None, None) on total failure.
        Callers must unpack the tuple rather than truth-testing it.
    """
    try:
        print("Sending to API for URLs...")
        # Create text-only version for API
        text_only_markdown = create_text_only_markdown(markdown_content)
        # First, send raw markdown to haste.nixc.us
        raw_haste_url = None
        try:
            print(" 📝 Creating raw markdown URL...")
            raw_response = requests.post(
                "https://haste.nixc.us/documents",
                data=text_only_markdown.encode('utf-8'),
                headers={"Content-Type": "text/plain"},
                timeout=30
            )
            if raw_response.status_code == 200:
                raw_token = raw_response.text.strip().strip('"')
                # Extract just the token from JSON response if needed
                if raw_token.startswith('{"key":"') and raw_token.endswith('"}'):
                    # BUG FIX: replaced a bare `except: pass` (which swallowed
                    # everything, including KeyboardInterrupt) with the two
                    # exceptions this parse can actually raise; also removed a
                    # redundant local `import json` (imported at module level).
                    try:
                        raw_token = json.loads(raw_token)['key']
                    except (json.JSONDecodeError, KeyError):
                        pass  # keep the raw token if the body isn't the expected JSON
                raw_haste_url = f"https://haste.nixc.us/{raw_token}"
                print(f" ✅ Raw markdown URL created")
            else:
                print(f" ⚠️ Raw markdown upload failed with status {raw_response.status_code}")
        except Exception as e:
            print(f" ⚠️ Failed to create raw markdown URL: {e}")
        # Then, send to md.colinknapp.com for HTML version
        html_url = None
        try:
            print(" 🎨 Creating HTML version URL...")
            api_data = {
                "markdown": text_only_markdown,
                "format": "html",
                "template": "playful",
                "title": f"Pitch Deck Analysis: {document_title}",
                "subtitle": "AI-Generated Analysis with Agentic Insights",
                "contact": "Generated by Pitch Deck Parser",
                "send_to_haste": True
            }
            response = requests.post(
                "https://md.colinknapp.com/api/convert",
                headers={"Content-Type": "application/json"},
                data=json.dumps(api_data),
                timeout=30
            )
            if response.status_code == 200:
                result = response.json()
                if 'haste_url' in result:
                    # Extract token from haste_url and format as requested
                    haste_url = result['haste_url']
                    if 'haste.nixc.us/' in haste_url:
                        token = haste_url.split('haste.nixc.us/')[-1]
                        html_url = f"https://md.colinknapp.com/haste/{token}"
                    else:
                        html_url = haste_url
                    print(f" ✅ HTML version URL created")
                else:
                    print(" ⚠️ API response missing haste_url")
            else:
                print(f" ⚠️ HTML API request failed with status {response.status_code}")
        except Exception as e:
            print(f" ⚠️ Failed to create HTML URL: {e}")
        return raw_haste_url, html_url
    except Exception as e:
        print(f"⚠️ Failed to send to API: {e}")
        return None, None

View File

@ -0,0 +1,235 @@
#!/usr/bin/env python3
"""
Market Cap Validator - Main Interface
This module provides a simple interface to validate market cap claims
from pitch deck slides using RAG search capabilities.
"""
import os
import json
from typing import List, Dict, Any, Optional
from .rag_agent import MarketCapRAGAgent
from .validation_report import ValidationReportGenerator
class MarketCapValidator:
"""
Main interface for market cap validation using RAG search
"""
def __init__(self, api_key: Optional[str] = None):
"""
Initialize the market cap validator
Args:
api_key: OpenRouter API key (if not provided, will use environment variable)
"""
self.rag_agent = MarketCapRAGAgent(api_key)
self.report_generator = ValidationReportGenerator()
def validate_from_slides(self, slide_texts: List[Dict[str, Any]],
save_report: bool = True) -> Dict[str, Any]:
"""
Validate market cap claims from slide text exports
Args:
slide_texts: List of slide data with 'slide_number' and 'text' keys
save_report: Whether to save the validation report to file
Returns:
Dictionary containing validation results and report
"""
print("🔍 Starting market cap validation process...")
# Extract and validate claims
validation_results = self.rag_agent.validate_all_claims(slide_texts)
# Generate report
report = self.report_generator.generate_report(validation_results, slide_texts)
# Save report if requested
report_filename = None
if save_report:
report_filename = self.report_generator.save_report(report)
print(f"📄 Validation report saved to: {report_filename}")
# Prepare summary
summary = self._generate_summary(validation_results)
return {
'validation_results': validation_results,
'report': report,
'report_filename': report_filename,
'summary': summary
}
def validate_from_file(self, file_path: str, save_report: bool = True) -> Dict[str, Any]:
"""
Validate market cap claims from a JSON file containing slide texts
Args:
file_path: Path to JSON file with slide data
save_report: Whether to save the validation report to file
Returns:
Dictionary containing validation results and report
"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
slide_texts = json.load(f)
print(f"📁 Loaded slide data from: {file_path}")
return self.validate_from_slides(slide_texts, save_report)
except FileNotFoundError:
raise FileNotFoundError(f"File not found: {file_path}")
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON file: {e}")
def validate_from_processed_folder(self, folder_path: str = "processed",
                                   save_report: bool = True) -> Dict[str, Any]:
    """
    Validate market cap claims from processed slide files.

    Args:
        folder_path: Path to folder containing processed slide JSON files.
        save_report: Whether to save the validation report to file.

    Returns:
        Dictionary containing validation results and report.

    Raises:
        ValueError: When no usable slide data is found in the folder.
    """
    slide_texts = []
    if os.path.exists(folder_path):
        # sorted() gives a deterministic processing order across platforms
        # (os.listdir order is filesystem-dependent).
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except json.JSONDecodeError as e:
                # BUG FIX: name the offending file (message previously
                # printed the literal "(unknown)").
                print(f"⚠️ Skipping invalid file {filename}: {e}")
                continue
            # Accept the three layouts produced by earlier pipeline stages:
            # a bare list of slides, {'slides': [...]}, or a single slide dict.
            if isinstance(data, list):
                slide_texts.extend(data)
            elif isinstance(data, dict) and 'slides' in data:
                slide_texts.extend(data['slides'])
            elif isinstance(data, dict) and 'text' in data:
                slide_texts.append(data)
    if not slide_texts:
        raise ValueError(f"No valid slide data found in {folder_path}")
    print(f"📁 Loaded {len(slide_texts)} slides from processed folder")
    return self.validate_from_slides(slide_texts, save_report)
def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
"""Generate a summary of validation results"""
total_claims = len(validation_results)
accurate_claims = sum(1 for r in validation_results if r.is_accurate)
inaccurate_claims = total_claims - accurate_claims
return {
'total_claims': total_claims,
'accurate_claims': accurate_claims,
'inaccurate_claims': inaccurate_claims,
'accuracy_rate': (accurate_claims / total_claims * 100) if total_claims > 0 else 0,
'claims_by_slide': self._group_claims_by_slide(validation_results)
}
def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
"""Group claims by slide number"""
claims_by_slide = {}
for result in validation_results:
slide_num = result.claim.slide_number
if slide_num not in claims_by_slide:
claims_by_slide[slide_num] = []
claims_by_slide[slide_num].append(result)
return claims_by_slide
def validate_market_caps(slide_texts: List[Dict[str, Any]],
                         api_key: Optional[str] = None,
                         save_report: bool = True) -> Dict[str, Any]:
    """Convenience wrapper: validate claims in already-loaded slide data.

    Args:
        slide_texts: Slide dicts with 'slide_number' and 'text' keys.
        api_key: OpenRouter API key (optional).
        save_report: Whether to save the validation report to file.

    Returns:
        Dictionary containing validation results and report.
    """
    return MarketCapValidator(api_key).validate_from_slides(slide_texts, save_report)
def validate_market_caps_from_file(file_path: str,
                                   api_key: Optional[str] = None,
                                   save_report: bool = True) -> Dict[str, Any]:
    """Convenience wrapper: validate claims stored in a JSON slide file.

    Args:
        file_path: Path to JSON file with slide data.
        api_key: OpenRouter API key (optional).
        save_report: Whether to save the validation report to file.

    Returns:
        Dictionary containing validation results and report.
    """
    return MarketCapValidator(api_key).validate_from_file(file_path, save_report)
def validate_market_caps_from_processed(folder_path: str = "processed",
                                        api_key: Optional[str] = None,
                                        save_report: bool = True) -> Dict[str, Any]:
    """Convenience wrapper: validate claims from the processed-slides folder.

    Args:
        folder_path: Folder containing processed slide JSON files.
        api_key: OpenRouter API key (optional).
        save_report: Whether to save the validation report to file.

    Returns:
        Dictionary containing validation results and report.
    """
    return MarketCapValidator(api_key).validate_from_processed_folder(folder_path, save_report)
if __name__ == "__main__":
    # Demo entry point: validates whatever slide data sits in processed/
    # and prints a short console summary.
    print("Market Cap Validator - RAG Agent")
    print("=================================")
    # Try to validate from processed folder
    try:
        results = validate_market_caps_from_processed()
        print(f"\n✅ Validation Complete!")
        print(f"📊 Summary:")
        print(f" - Total Claims: {results['summary']['total_claims']}")
        print(f" - Accurate: {results['summary']['accurate_claims']}")
        print(f" - Inaccurate: {results['summary']['inaccurate_claims']}")
        print(f" - Accuracy Rate: {results['summary']['accuracy_rate']:.1f}%")
        if results['report_filename']:
            print(f"📄 Report saved to: {results['report_filename']}")
    except Exception as e:
        # Broad catch is deliberate for a demo script: surface the error
        # and fall back to printing usage hints instead of a traceback.
        print(f"❌ Error: {e}")
        print("\nUsage examples:")
        print("1. Place slide data JSON files in 'processed/' folder")
        print("2. Run: python -m modules.market_cap_validator")
        print("3. Or use the functions directly in your code")

60
modules/pdf_processor.py Normal file
View File

@ -0,0 +1,60 @@
#!/usr/bin/env python3
import base64
import fitz # PyMuPDF for PDF processing
from pathlib import Path
def extract_slides_from_pdf(pdf_path, output_dir, document_name):
    """Extract each page of a PDF as a PNG slide image plus a base64 copy.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Root directory for processed output (default behavior of
            callers in this project is "processed").
        document_name: Name used for the output subfolder and file prefixes.

    Returns:
        List of per-slide dicts (page_num, filename, path, base64,
        document_name, processed_dir); an empty list on failure.
    """
    print(f"Extracting slides from PDF: {pdf_path}")
    # BUG FIX: honour the output_dir argument — it was previously accepted
    # but ignored in favour of a hard-coded "processed".
    processed_dir = Path(output_dir) / document_name
    processed_dir.mkdir(parents=True, exist_ok=True)
    # Create slides directory within the processed directory
    slides_dir = processed_dir / "slides"
    slides_dir.mkdir(exist_ok=True)
    slides = []
    pdf_document = None
    try:
        # Open PDF with PyMuPDF
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            # 2x zoom renders each page at double resolution for better quality
            mat = fitz.Matrix(2.0, 2.0)
            pix = page.get_pixmap(matrix=mat)
            # Save as PNG with document name prefix
            slide_filename = f"{document_name}_slide_{page_num + 1:03d}.png"
            slide_path = slides_dir / slide_filename
            pix.save(str(slide_path))
            # Convert to base64 for API upload
            img_data = pix.tobytes("png")
            img_base64 = base64.b64encode(img_data).decode('utf-8')
            slides.append({
                'page_num': page_num + 1,
                'filename': slide_filename,
                'path': slide_path,
                'base64': img_base64,
                'document_name': document_name,
                'processed_dir': processed_dir
            })
            print(f" Extracted slide {page_num + 1}")
        print(f"✅ Extracted {len(slides)} slides")
        return slides
    except Exception as e:
        print(f"❌ Error extracting slides: {e}")
        return []
    finally:
        # BUG FIX: always release the document handle — it was previously
        # leaked when an exception occurred mid-extraction.
        if pdf_document is not None:
            pdf_document.close()

286
modules/rag_agent.py Normal file
View File

@ -0,0 +1,286 @@
#!/usr/bin/env python3
import re
import json
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from .client import get_openrouter_client
@dataclass
class MarketCapClaim:
    """Represents a market cap claim found in slide text"""
    slide_number: int          # 1-based slide the claim came from
    company_name: str          # best-effort company name pulled from the slide
    claimed_market_cap: str    # raw value string as written, e.g. "2.5B"
    raw_text: str              # exact text matched by the extraction regex
    confidence: float          # heuristic extraction confidence in [0, 1]


@dataclass
class ValidationResult:
    """Represents the validation result for a market cap claim"""
    claim: MarketCapClaim                # the original claim being validated
    validated_market_cap: Optional[str]  # figure found via RAG search, if any
    validation_source: str               # where the validated figure came from
    confidence_score: float              # confidence in the validation itself
    is_accurate: bool                    # True when values agree within 20%
    discrepancy: Optional[str]           # human-readable mismatch description
    rag_search_query: str                # query sent to the search-backed model
    rag_response: str                    # raw model response text


class MarketCapRAGAgent:
    """
    RAG Agent for validating market cap claims from pitch deck slides
    using OpenRouter's web search capabilities
    """
    def __init__(self, api_key: Optional[str] = None):
        # NOTE(review): api_key is accepted but never passed on —
        # get_openrouter_client() is called with no arguments here.
        # Presumably the client reads the key from the environment; confirm.
        self.client = get_openrouter_client()
        # Each pattern captures the dollar figure (group 1) in a different
        # phrasing of a market-cap/valuation statement; matched IGNORECASE.
        self.market_cap_patterns = [
            r'market\s+cap(?:italization)?\s*:?\s*\$?([0-9,.]+[BMK]?)',
            r'valuation\s*:?\s*\$?([0-9,.]+[BMK]?)',
            r'worth\s*:?\s*\$?([0-9,.]+[BMK]?)',
            r'valued\s+at\s*:?\s*\$?([0-9,.]+[BMK]?)',
            r'\$([0-9,.]+[BMK]?)\s+(?:market\s+cap|valuation)',
            r'(?:market\s+cap|valuation)\s+of\s+\$?([0-9,.]+[BMK]?)'
        ]

    def extract_market_cap_claims(self, slide_texts: List[Dict[str, Any]]) -> List[MarketCapClaim]:
        """
        Extract market cap claims from slide text exports.

        Args:
            slide_texts: List of slide data with 'slide_number' and 'text' keys.

        Returns:
            List of MarketCapClaim objects (possibly several per slide).
        """
        claims = []
        for slide_data in slide_texts:
            slide_number = slide_data.get('slide_number', 0)
            text = slide_data.get('text', '')
            if not text:
                continue
            # Company name is usually in the first few lines / slide title.
            company_name = self._extract_company_name(text)
            for pattern in self.market_cap_patterns:
                matches = re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
                for match in matches:
                    claimed_value = match.group(1)
                    raw_text = match.group(0)
                    # Confidence is based on the words surrounding the match.
                    confidence = self._calculate_confidence(text, match.start(), match.end())
                    claims.append(MarketCapClaim(
                        slide_number=slide_number,
                        company_name=company_name,
                        claimed_market_cap=claimed_value,
                        raw_text=raw_text,
                        confidence=confidence
                    ))
        return claims

    def _extract_company_name(self, text: str) -> str:
        """Best-effort company name: first plausible non-header line of the slide."""
        lines = text.split('\n')[:5]  # Check first 5 lines only
        for line in lines:
            line = line.strip()
            if line and 2 < len(line) < 100:
                # Skip common slide headers
                if not any(header in line.lower() for header in ['slide', 'page', 'agenda', 'overview']):
                    return line
        return "Unknown Company"

    def _calculate_confidence(self, text: str, start: int, end: int) -> float:
        """Heuristic confidence score in [0, 1] based on nearby context words."""
        confidence = 0.5  # Base confidence
        # Examine ~50 characters on each side of the match.
        context_start = max(0, start - 50)
        context_end = min(len(text), end + 50)
        context = text[context_start:context_end].lower()
        # Recency indicators suggest the figure is current.
        if any(indicator in context for indicator in ['current', 'latest', 'as of', '2024', '2025']):
            confidence += 0.2
        if any(indicator in context for indicator in ['billion', 'million', 'trillion']):
            confidence += 0.1
        if 'market cap' in context or 'valuation' in context:
            confidence += 0.2
        return min(confidence, 1.0)

    def validate_claim_with_rag(self, claim: MarketCapClaim) -> ValidationResult:
        """
        Validate a market cap claim using RAG search.

        Args:
            claim: MarketCapClaim to validate.

        Returns:
            ValidationResult with validation details; on API failure, a result
            with confidence 0.0 and the error recorded in the discrepancy.
        """
        search_query = f"{claim.company_name} current market cap valuation 2024 2025"
        try:
            # Use OpenRouter with online search enabled
            response = self.client.chat.completions.create(
                model="mistralai/mistral-small",
                messages=[
                    {
                        "role": "user",
                        "content": f"""
Please search for the current market cap or valuation of {claim.company_name}.
The company claims their market cap is ${claim.claimed_market_cap}.
Please provide:
1. The current market cap/valuation if found
2. The source of this information
3. Whether the claimed value appears accurate
4. Any significant discrepancies
Focus on recent data from 2024-2025.
"""
                    }
                ],
                max_tokens=800
            )
            rag_response = response.choices[0].message.content.strip()
            validation_details = self._parse_rag_response(rag_response, claim)
            return ValidationResult(
                claim=claim,
                validated_market_cap=validation_details.get('validated_cap'),
                validation_source=validation_details.get('source', 'RAG Search'),
                confidence_score=validation_details.get('confidence', 0.5),
                is_accurate=validation_details.get('is_accurate', False),
                discrepancy=validation_details.get('discrepancy'),
                rag_search_query=search_query,
                rag_response=rag_response
            )
        except Exception as e:
            # Fail soft: a broken search yields an "inaccurate/unknown" result
            # rather than aborting the whole validation run.
            return ValidationResult(
                claim=claim,
                validated_market_cap=None,
                validation_source="Error",
                confidence_score=0.0,
                is_accurate=False,
                discrepancy=f"RAG search failed: {str(e)}",
                rag_search_query=search_query,
                rag_response=f"Error: {str(e)}"
            )

    def _parse_rag_response(self, response: str, claim: MarketCapClaim) -> Dict[str, Any]:
        """Parse a free-text RAG answer into structured validation details.

        BUG FIX: matching previously ran against a lower-cased copy of the
        response while the character class was upper-case ([BMK]), so suffixes
        like the "B" in "$5B" were silently dropped and the normalized
        comparison lost its multiplier. Matching is now case-insensitive
        against the original text, and spelled-out multipliers
        ("5 billion") are converted to suffix form.
        """
        details = {
            'validated_cap': None,
            'source': 'RAG Search',
            'confidence': 0.5,
            'is_accurate': False,
            'discrepancy': None
        }
        # Prefer an explicitly labelled figure, then any dollar amount.
        cap_patterns = [
            r'market\s+cap(?:italization)?\s*:?\s*\$?([0-9,.]+[BMKT]?)',
            r'\$([0-9,.]+[BMKT]?)',
        ]
        for pattern in cap_patterns:
            match = re.search(pattern, response, re.IGNORECASE)
            if match:
                details['validated_cap'] = match.group(1)
                break
        if details['validated_cap'] is None:
            # Spelled-out multiplier, e.g. "2.5 billion" -> "2.5B".
            word_match = re.search(r'([0-9,.]+)\s+(billion|million|trillion)',
                                   response, re.IGNORECASE)
            if word_match:
                suffix = {'billion': 'B', 'million': 'M', 'trillion': 'T'}[word_match.group(2).lower()]
                details['validated_cap'] = word_match.group(1) + suffix
        # Determine accuracy by comparing normalized magnitudes.
        if details['validated_cap']:
            claimed_normalized = self._normalize_value(claim.claimed_market_cap)
            validated_normalized = self._normalize_value(details['validated_cap'])
            if claimed_normalized and validated_normalized:
                # Allow for some variance: values within 20% count as a match.
                ratio = min(claimed_normalized, validated_normalized) / max(claimed_normalized, validated_normalized)
                details['is_accurate'] = ratio > 0.8
                if not details['is_accurate']:
                    details['discrepancy'] = f"Claimed: ${claim.claimed_market_cap}, Found: ${details['validated_cap']}"
        # Extract source information, preserving the original casing.
        source_match = re.search(r'(?:source:|according to)\s*([^\n]+)', response, re.IGNORECASE)
        if source_match:
            details['source'] = source_match.group(1).strip()
        return details

    def _normalize_value(self, value: str) -> Optional[float]:
        """Convert a value string like "2.5B" into a plain float, or None."""
        if not value:
            return None
        value = value.replace(',', '').upper()
        multiplier = 1
        if value.endswith('B'):
            multiplier = 1_000_000_000
            value = value[:-1]
        elif value.endswith('M'):
            multiplier = 1_000_000
            value = value[:-1]
        elif value.endswith('K'):
            multiplier = 1_000
            value = value[:-1]
        elif value.endswith('T'):
            multiplier = 1_000_000_000_000
            value = value[:-1]
        try:
            return float(value) * multiplier
        except ValueError:
            return None

    def validate_all_claims(self, slide_texts: List[Dict[str, Any]]) -> List[ValidationResult]:
        """
        Extract and validate all market cap claims from slide texts.

        Args:
            slide_texts: List of slide data with 'slide_number' and 'text' keys.

        Returns:
            List of ValidationResult objects, one per extracted claim.
        """
        claims = self.extract_market_cap_claims(slide_texts)
        results = []
        print(f"Found {len(claims)} market cap claims to validate...")
        for i, claim in enumerate(claims, 1):
            print(f" Validating claim {i}/{len(claims)}: {claim.company_name} - ${claim.claimed_market_cap}")
            results.append(self.validate_claim_with_rag(claim))
        return results

6
modules/requirements.txt Normal file
View File

@ -0,0 +1,6 @@
pdf2image
openai
requests
PyMuPDF
docling
python-dotenv

129
modules/validate_market_caps.py Executable file
View File

@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Clean Market Cap Validation CLI
Validates market cap claims from pitch deck slides using RAG search.
Reports are automatically organized in the processed/ directory.
"""
import sys
import os
import argparse
from modules.document_validator import (
validate_document_claims,
validate_all_processed_documents
)
def _print_batch_results(results):
    """Print a one-line summary (and report path) for each validated document."""
    print(f"\n✅ Validation Complete!")
    print(f"📊 Processed {len(results)} documents:")
    for doc_name, doc_results in results.items():
        if 'error' in doc_results:
            print(f"{doc_name}: {doc_results['error']}")
        else:
            summary = doc_results['summary']
            print(f"{doc_name}: {summary['total_claims']} claims, {summary['accuracy_rate']:.1f}% accurate")
            if doc_results['report_filename']:
                print(f" 📄 Report: {doc_results['report_filename']}")


def main():
    """CLI entry point: parse arguments and dispatch the requested validation.

    Exits with status 1 when no API key is available or validation fails.
    """
    parser = argparse.ArgumentParser(
        description="Validate market cap claims from pitch deck slides using RAG search"
    )
    parser.add_argument(
        '--file', '-f',
        help='Path to JSON file containing slide data'
    )
    parser.add_argument(
        '--document', '-d',
        help='Document name for organized reporting'
    )
    parser.add_argument(
        '--all',
        action='store_true',
        help='Validate all documents in processed/ folder'
    )
    parser.add_argument(
        '--no-save',
        action='store_true',
        help='Do not save validation report to file'
    )
    parser.add_argument(
        '--api-key',
        help='OpenRouter API key (or set OPENROUTER_API_KEY environment variable)'
    )
    args = parser.parse_args()
    # CLI flag takes precedence over the environment variable.
    api_key = args.api_key or os.getenv('OPENROUTER_API_KEY')
    if not api_key:
        print("❌ Error: OpenRouter API key required")
        print(" Set OPENROUTER_API_KEY environment variable or use --api-key")
        sys.exit(1)
    try:
        print("🔍 Market Cap Validation with RAG Search")
        print("=========================================")
        # --all and the no-argument default run the same batch validation;
        # the branches were previously duplicated verbatim. --all still wins
        # over --file when both are given, matching the original precedence.
        if args.all or not args.file:
            suffix = "" if args.all else " (default)"
            print(f"📁 Validating all documents in processed/ folder{suffix}")
            results = validate_all_processed_documents(api_key=api_key)
            _print_batch_results(results)
        else:
            document_name = args.document or "Unknown-Document"
            print(f"📁 Validating from file: {args.file}")
            import json
            with open(args.file, 'r', encoding='utf-8') as f:
                slide_data = json.load(f)
            results = validate_document_claims(
                document_name,
                slide_data,
                api_key=api_key,
                save_report=not args.no_save
            )
            # Display results
            summary = results['summary']
            print(f"\n✅ Validation Complete!")
            print(f"📊 Results Summary:")
            print(f" - Total Claims Found: {summary['total_claims']}")
            print(f" - Accurate Claims: {summary['accurate_claims']}")
            print(f" - Inaccurate Claims: {summary['inaccurate_claims']}")
            print(f" - Accuracy Rate: {summary['accuracy_rate']:.1f}%")
            if results['report_filename']:
                print(f"📄 Detailed report saved to: {results['report_filename']}")
    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,233 @@
#!/usr/bin/env python3
from typing import List, Dict, Any
from datetime import datetime
import os
from .rag_agent import ValidationResult, MarketCapClaim
class ValidationReportGenerator:
    """
    Generates comprehensive validation reports for market cap claims
    with slide source tracking
    """
    def __init__(self):
        # NOTE(review): report_sections is never read or written anywhere in
        # this class — it looks like dead state; confirm before removing.
        self.report_sections = []

    def generate_report(self, validation_results: List[ValidationResult],
                        slide_texts: List[Dict[str, Any]]) -> str:
        """
        Generate a comprehensive validation report

        Args:
            validation_results: List of ValidationResult objects
            slide_texts: Original slide text data for context

        Returns:
            Formatted markdown report string
        """
        # Assemble the report from independent sections, joined by blank lines.
        report = []
        # Header
        report.append(self._generate_header())
        # Executive Summary
        report.append(self._generate_executive_summary(validation_results))
        # Detailed Results
        report.append(self._generate_detailed_results(validation_results))
        # Slide Source Analysis
        report.append(self._generate_slide_source_analysis(validation_results, slide_texts))
        # RAG Search Details
        report.append(self._generate_rag_search_details(validation_results))
        # Recommendations
        report.append(self._generate_recommendations(validation_results))
        return '\n\n'.join(report)

    def _generate_header(self) -> str:
        """Generate report header with a generation timestamp."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return f"""# Market Cap Validation Report
**Generated:** {timestamp}
**Report Type:** RAG-Enhanced Validation Analysis
**Validation Method:** OpenRouter Web Search Integration
---
"""

    def _generate_executive_summary(self, results: List[ValidationResult]) -> str:
        """Generate executive summary section with headline metrics."""
        total_claims = len(results)
        accurate_claims = sum(1 for r in results if r.is_accurate)
        inaccurate_claims = total_claims - accurate_claims
        # "High confidence" refers to the validation's own confidence score.
        high_confidence = sum(1 for r in results if r.confidence_score > 0.7)
        # Guard against division by zero when no claims were found.
        accuracy_rate = (accurate_claims / total_claims * 100) if total_claims > 0 else 0
        # Assessment thresholds: >70% GOOD, <50% CAUTION, otherwise MIXED.
        return f"""## Executive Summary
### Key Metrics
- **Total Market Cap Claims Analyzed:** {total_claims}
- **Claims Validated as Accurate:** {accurate_claims} ({accuracy_rate:.1f}%)
- **Claims with Discrepancies:** {inaccurate_claims}
- **High Confidence Validations:** {high_confidence}
### Overall Assessment
{'✅ **GOOD** - Most claims appear accurate' if accuracy_rate > 70 else '⚠️ **CAUTION** - Significant discrepancies found' if accuracy_rate < 50 else '🔍 **MIXED** - Some claims require verification'}
---
"""

    def _generate_detailed_results(self, results: List[ValidationResult]) -> str:
        """Generate one detailed subsection per validated claim."""
        if not results:
            return "## Detailed Results\n\nNo market cap claims found in the analyzed slides.\n\n---"
        report = ["## Detailed Validation Results\n"]
        for i, result in enumerate(results, 1):
            # NOTE(review): the first two icons are empty strings — likely
            # ✅/❌ emoji lost in transit; confirm the intended glyphs.
            status_icon = "" if result.is_accurate else "" if result.discrepancy else "⚠️"
            confidence_bar = self._generate_confidence_bar(result.confidence_score)
            report.append(f"""### {status_icon} Claim #{i}: {result.claim.company_name}
**Slide Source:** Slide {result.claim.slide_number}
**Claimed Market Cap:** ${result.claim.claimed_market_cap}
**Raw Text:** `{result.claim.raw_text}`
**Confidence Score:** {confidence_bar} ({result.confidence_score:.2f})
**Validation Results:**
- **Validated Market Cap:** {result.validated_market_cap or 'Not found'}
- **Validation Source:** {result.validation_source}
- **Accuracy Status:** {'✅ Accurate' if result.is_accurate else '❌ Inaccurate' if result.discrepancy else '⚠️ Uncertain'}
""")
            if result.discrepancy:
                report.append(f"- **Discrepancy:** {result.discrepancy}")
            report.append(f"- **RAG Search Query:** `{result.rag_search_query}`")
            report.append("")
        report.append("---")
        return '\n'.join(report)

    def _generate_slide_source_analysis(self, results: List[ValidationResult],
                                        slide_texts: List[Dict[str, Any]]) -> str:
        """Generate a per-slide breakdown of where each claim was found."""
        report = ["## Slide Source Analysis\n"]
        # Group results by slide
        slide_claims = {}
        for result in results:
            slide_num = result.claim.slide_number
            if slide_num not in slide_claims:
                slide_claims[slide_num] = []
            slide_claims[slide_num].append(result)
        # Map slide numbers to their original text for previews.
        slide_text_map = {s.get('slide_number', 0): s.get('text', '') for s in slide_texts}
        for slide_num in sorted(slide_claims.keys()):
            claims = slide_claims[slide_num]
            slide_text = slide_text_map.get(slide_num, 'No text available')
            report.append(f"""### Slide {slide_num} Analysis
**Claims Found:** {len(claims)}
**Slide Text Preview:** {slide_text[:200]}{'...' if len(slide_text) > 200 else ''}
**Claims Details:**""")
            for claim in claims:
                # Linear scan over all results per claim; fine for small reports.
                status = "✅ Accurate" if any(r.claim == claim and r.is_accurate for r in results) else "❌ Inaccurate"
                report.append(f"- {claim.company_name}: ${claim.claimed_market_cap} - {status}")
            report.append("")
        report.append("---")
        return '\n'.join(report)

    def _generate_rag_search_details(self, results: List[ValidationResult]) -> str:
        """Generate methodology notes, the queries used, and sample responses."""
        report = ["## RAG Search Details\n"]
        report.append("### Search Methodology")
        report.append("- **Search Engine:** OpenRouter with Exa integration")
        report.append("- **Model:** Mistral Small with online search enabled")
        report.append("- **Search Focus:** Current market cap data (2024-2025)")
        report.append("- **Validation Threshold:** 80% accuracy tolerance")
        report.append("")
        report.append("### Search Queries Used")
        # NOTE(review): list(set(...)) yields a nondeterministic query order,
        # so consecutive reports can list queries differently.
        unique_queries = list(set(r.rag_search_query for r in results))
        for i, query in enumerate(unique_queries, 1):
            report.append(f"{i}. `{query}`")
        report.append("")
        report.append("### Sample RAG Responses")
        for i, result in enumerate(results[:3], 1):  # Show first 3 responses only
            report.append(f"""#### Response #{i}: {result.claim.company_name}
```
{result.rag_response[:300]}{'...' if len(result.rag_response) > 300 else ''}
```""")
        report.append("---")
        return '\n'.join(report)

    def _generate_recommendations(self, results: List[ValidationResult]) -> str:
        """Generate recommendations: flagged claims, confident ones, general advice."""
        inaccurate_results = [r for r in results if not r.is_accurate and r.discrepancy]
        high_confidence_results = [r for r in results if r.confidence_score > 0.7]
        report = ["## Recommendations\n"]
        if inaccurate_results:
            report.append("### ⚠️ Claims Requiring Attention")
            for result in inaccurate_results:
                report.append(f"- **Slide {result.claim.slide_number}:** {result.claim.company_name} - {result.discrepancy}")
            report.append("")
        if high_confidence_results:
            report.append("### ✅ High Confidence Validations")
            report.append("The following claims were validated with high confidence:")
            for result in high_confidence_results:
                report.append(f"- **Slide {result.claim.slide_number}:** {result.claim.company_name} - ${result.claim.claimed_market_cap}")
            report.append("")
        report.append("### 📋 General Recommendations")
        report.append("1. **Verify Discrepancies:** Review claims marked as inaccurate with stakeholders")
        report.append("2. **Update Sources:** Consider updating slide sources with more recent data")
        report.append("3. **Regular Validation:** Implement periodic validation of financial claims")
        report.append("4. **Source Attribution:** Always include data sources and dates in financial slides")
        report.append("\n---")
        report.append("*Report generated by Market Cap RAG Validation Agent*")
        return '\n'.join(report)

    def _generate_confidence_bar(self, confidence: float) -> str:
        """Generate a visual 10-segment confidence bar for a score in [0, 1]."""
        filled = int(confidence * 10)
        empty = 10 - filled
        # NOTE(review): both segment glyphs are empty strings, so the bar
        # renders as "[]" — likely █/░ characters lost in transit; confirm.
        return f"[{'' * filled}{'' * empty}]"

    def save_report(self, report: str, filename: str = None, processed_dir: str = "processed") -> str:
        """Save report to file; auto-names with a timestamp when filename is None.

        Returns:
            The path the report was written to.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"market_cap_validation_report_{timestamp}.md"
        # Create processed directory if it doesn't exist
        os.makedirs(processed_dir, exist_ok=True)
        filepath = os.path.join(processed_dir, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(report)
        return filepath

62
modules/working_app.py Normal file
View File

@ -0,0 +1,62 @@
#!/usr/bin/env python3
import sys
import os
from pathlib import Path
def process_pitch_deck(pdf_path):
    """Run the slide-extraction + analysis pipeline end to end.

    Deliberately imports its collaborators locally and assembles the markdown
    report inline, sidestepping the broken create_slide_markdown signature.
    """
    print(f"Processing: {pdf_path}")
    from client import get_openrouter_client
    from pdf_processor import extract_slides_from_pdf
    from analysis import analyze_slides_batch

    stem = Path(pdf_path).stem

    # Extract slides (this works)
    slides = extract_slides_from_pdf(pdf_path, "processed", stem)
    print(f"Extracted {len(slides)} slides")

    # Analyze slides (this works)
    analysis_results = analyze_slides_batch(get_openrouter_client(), slides)
    print("Analysis complete")

    # Build the report as a list of fragments and join once at the end.
    parts = [f"# Pitch Deck Analysis: {stem}\n\n"]
    for slide_num, slide_data in enumerate(slides, start=1):
        analysis = analysis_results.get(slide_num, {})
        parts.append(f"## Slide {slide_num}\n\n")
        parts.append(f"![Slide {slide_num}](slides/{slide_data['filename']})\n\n")
        if analysis:
            parts.append(f"**Analysis:**\n{analysis}\n\n")
        else:
            parts.append("**Analysis:** No analysis available\n\n")
        parts.append("---\n\n")
    markdown_content = "".join(parts)

    # Save report
    output_file = f"processed/{stem}_analysis.md"
    os.makedirs("processed", exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(markdown_content)
    print(f"Report saved to: {output_file}")
    return output_file
if __name__ == "__main__":
    # Minimal CLI: one positional argument naming the PDF to process.
    if len(sys.argv) < 2:
        print("Usage: python working_app.py <pdf_path>")
        sys.exit(1)
    pdf_path = sys.argv[1]
    # Fail fast with a readable message instead of an open() traceback.
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found")
        sys.exit(1)
    process_pitch_deck(pdf_path)

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 94 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 86 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 101 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 110 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 91 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 93 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 126 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 327 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 93 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 105 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 100 KiB

File diff suppressed because it is too large Load Diff

6
requirements.txt Normal file
View File

@ -0,0 +1,6 @@
pdf2image
openai
requests
PyMuPDF
docling
python-dotenv

58
start.sh Executable file
View File

@ -0,0 +1,58 @@
#!/bin/bash
# Launcher for the pitch deck analysis application.
# Usage: ./start.sh <file_path>
# Sets up a virtualenv, installs dependencies, then runs app.py on the file.

# IMPROVEMENT: show help and validate arguments BEFORE any environment
# setup — previously a plain --help (or a typo'd path) first created the
# venv and ran a full pip install.
if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
    echo ""
    echo "Pitch Deck Analysis Application"
    echo "=============================="
    echo "Usage: ./start.sh <file_path>"
    echo "Example: ./start.sh presentation.pdf"
    echo ""
    echo "The application will automatically upload the generated report."
    echo ""
    exit 0
fi

# Verify a file argument was supplied and exists
if [ -z "$1" ]; then
    echo "Error: No file specified"
    echo "Usage: ./start.sh <file_path>"
    exit 1
fi
if [ ! -f "$1" ]; then
    echo "Error: File '$1' not found"
    exit 1
fi

# Kill any process running on port 3123
# NOTE(review): app.py does not appear to listen on any port — this looks
# like a leftover from an earlier server-based design; confirm before removing.
echo "Killing any existing processes on port 3123..."
fuser -k 3123/tcp 2>/dev/null || true

# Create virtual environment if it doesn't exist
if [ ! -d "venv" ]; then
    echo "Creating virtual environment..."
    python3 -m venv venv
fi

# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate

# Verify virtual environment is active
echo "Verifying virtual environment..."
which python3
python3 --version

# Install dependencies
echo "Installing dependencies..."
pip install -r requirements.txt

# Start the application with immediate feedback
echo "Starting pitch deck parser..."
echo "Processing file: $1"
echo "Python path: $(which python3)"
echo "Working directory: $(pwd)"
echo "----------------------------------------"
python3 app.py "$1"