Initial commit: Technical screen project with document analysis capabilities
|
|
@ -0,0 +1,5 @@
|
||||||
|
---
|
||||||
|
alwaysApply: true
|
||||||
|
---
|
||||||
|
# Code Cleanup Guidelines
|
||||||
|
Remove unused code, imports, and dead functions to keep the codebase clean and maintainable. Regular cleanup prevents technical debt and improves code readability.
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
---
|
||||||
|
alwaysApply: true
|
||||||
|
---
|
||||||
|
# Code Length Guidelines
|
||||||
|
Keep all code files under 300 lines for better maintainability and readability. If a file exceeds this limit, consider breaking it into smaller, focused modules.
|
||||||
|
|
@ -0,0 +1,55 @@
|
||||||
|
# Environment variables
|
||||||
|
.env
|
||||||
|
|
||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
venv/
|
||||||
|
env/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# OS
|
||||||
|
.DS_Store
|
||||||
|
.DS_Store?
|
||||||
|
._*
|
||||||
|
.Spotlight-V100
|
||||||
|
.Trashes
|
||||||
|
ehthumbs.db
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# Temporary files
|
||||||
|
*.tmp
|
||||||
|
*.temp
|
||||||
|
|
@ -0,0 +1,152 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def generate_toc(markdown_content):
    """Build a markdown Table of Contents from ``##``-and-deeper headers.

    Scans *markdown_content* line by line for headers of level 2 or more,
    turns each into a GitHub-style anchor link, and indents entries one
    space per level below ``##``.  Top-level ``#`` headers are ignored.

    Args:
        markdown_content: full markdown document as a single string.

    Returns:
        The TOC as a markdown string ending with a ``---`` separator.
    """
    print(" 📋 Generating Table of Contents...")

    header_re = re.compile(r'^(#{2,})\s+(.+)$')
    entries = ["## Table of Contents", ""]

    # Collect every header match in document order.
    matches = [m for m in map(header_re.match, markdown_content.split('\n')) if m]

    for m in matches:
        depth = len(m.group(1)) - 2  # ## -> 0, ### -> 1, ...
        heading = m.group(2)

        # GitHub-style anchor: lowercase, strip punctuation, spaces -> dashes.
        slug = re.sub(r'[^a-zA-Z0-9\s-]', '', heading.lower())
        slug = re.sub(r'\s+', '-', slug.strip())

        entries.append(f"{' ' * depth}- [{heading}](#{slug})")

    entries += ["", "---", ""]

    print(f" ✅ Generated TOC with {len(matches)} headers")
    return '\n'.join(entries)
|
||||||
|
|
||||||
|
def main():
    """Simple pitch deck analyzer CLI.

    Usage: ``python app.py <pdf_file>``.  Extracts slides from the PDF,
    runs the multi-agent analysis, writes a markdown report (with TOC)
    to ``processed/<stem>_analysis.md``, and uploads it.
    """
    # Require exactly one positional argument: the PDF path.
    if len(sys.argv) < 2:
        print("Usage: python app.py <pdf_file>")
        return

    pdf_path = sys.argv[1]
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found")
        return

    print(f"🚀 Processing: {pdf_path}")

    # Import what we need directly (avoid __init__.py issues)
    # NOTE(review): these are deferred, flat imports from the local
    # 'modules' directory — they bypass the package's __init__.py on purpose.
    print("📦 Importing modules...")
    sys.path.append('modules')
    from client import get_openrouter_client
    from pdf_processor import extract_slides_from_pdf
    from analysis import analyze_slides_batch
    from markdown_utils import send_to_api_and_get_haste_link
    print("✅ Modules imported successfully")

    # Extract slides: renders each page under processed/<stem>/.
    print("📄 Extracting slides...")
    slides = extract_slides_from_pdf(pdf_path, "processed", Path(pdf_path).stem)
    print(f"✅ Extracted {len(slides)} slides")

    # Analyze slides with the specialized agent panel.
    print("🧠 Analyzing slides...")
    client = get_openrouter_client()
    print("🔗 API client initialized")

    analysis_results = analyze_slides_batch(client, slides)
    print("✅ Analysis complete")

    # Create report: a single markdown string built incrementally.
    print("📝 Creating report...")
    markdown_content = f"# Pitch Deck Analysis: {Path(pdf_path).stem}\n\n"

    # Add analysis metadata
    markdown_content += "This analysis was generated using multiple AI agents, each specialized in different aspects of slide evaluation.\n\n"
    markdown_content += f"**Source File:** `{Path(pdf_path).name}` (PDF)\n"
    markdown_content += f"**Analysis Generated:** {len(slides)} slides processed\n"
    markdown_content += "**Processing Method:** Individual processing with specialized AI agents\n"
    markdown_content += "**Text Extraction:** Docling-powered text transcription\n\n"

    print(f"📊 Building markdown for {len(slides)} slides...")
    for i, slide_data in enumerate(slides):
        # Slides are keyed 1-based in analysis_results (matches page_num).
        slide_num = i + 1
        analysis = analysis_results.get(slide_num, {})

        print(f" 📄 Processing slide {slide_num}...")

        markdown_content += f"# Slide {slide_num}\n\n"
        # NOTE(review): this f-string has no placeholders — it looks like a
        # slide-image embed (e.g. ![Slide](...)) was lost here; confirm.
        markdown_content += f"\n\n"

        if analysis:
            markdown_content += "## Agentic Analysis\n\n"

            # Format each agent's analysis; skip malformed entries.
            agent_count = 0
            for agent_key, agent_data in analysis.items():
                if isinstance(agent_data, dict) and 'agent' in agent_data and 'analysis' in agent_data:
                    agent_count += 1
                    agent_name = agent_data['agent']
                    agent_analysis = agent_data['analysis']

                    markdown_content += f"### {agent_name}\n\n"
                    markdown_content += f"{agent_analysis}\n\n"

            print(f" ✅ Added {agent_count} agent analyses")
        else:
            markdown_content += "## Agentic Analysis\n\n"
            markdown_content += "No analysis available\n\n"
            print(f" ⚠️ No analysis available for slide {slide_num}")

        markdown_content += "---\n\n"

    # Generate Table of Contents
    # NOTE(review): generate_toc() prints its own "Generating..." line, so
    # this message appears twice in the console output.
    print("📋 Generating Table of Contents...")
    toc = generate_toc(markdown_content)

    # Insert TOC after the main title (assumes line 0 is the title and
    # line 1 is the blank line written by the initial f-string above).
    print("🔗 Inserting TOC into document...")
    lines = markdown_content.split('\n')
    final_content = []
    final_content.append(lines[0])  # Main title
    final_content.append("")  # Empty line
    final_content.append(toc)  # TOC
    final_content.extend(lines[2:])  # Rest of content

    final_markdown = '\n'.join(final_content)

    # Save report
    output_file = f"processed/{Path(pdf_path).stem}_analysis.md"
    print(f"💾 Saving report to: {output_file}")
    os.makedirs("processed", exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_markdown)

    print(f"✅ Report saved successfully ({len(final_markdown)} characters)")

    # Always upload the report
    print("🌐 Uploading report...")
    haste_url = send_to_api_and_get_haste_link(final_markdown, Path(pdf_path).stem)
    if haste_url:
        print(f"✅ Report uploaded to: {haste_url}")
    else:
        print("❌ Upload failed")


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
# OpenRouter API Configuration
|
||||||
|
OPENROUTER_API_KEY=your_openrouter_api_key_here
|
||||||
|
|
||||||
|
# Optional: Custom OpenAI model (defaults to gpt-3.5-turbo)
|
||||||
|
# OPENROUTER_MODEL=openai/gpt-3.5-turbo
|
||||||
|
|
@ -0,0 +1,85 @@
|
||||||
|
#!/usr/bin/env python3

# Pitch Deck Parser Modules
# This package contains all the modular components for the pitch deck analysis application

# Core pipeline: API client, file conversion, PDF/slide extraction,
# docling-based text extraction, agent analysis, and markdown output.
from .client import get_openrouter_client
from .file_utils import detect_file_type, convert_to_pdf, convert_with_libreoffice
from .pdf_processor import extract_slides_from_pdf
from .docling_processor import extract_text_with_docling, get_slide_text_content
from .analysis import (
    analyze_slide_with_single_prompt,
    analyze_slides_batch,
    analyze_slide_with_agentic_prompts_parallel,
    process_single_slide_parallel
)
from .markdown_utils import (
    create_slide_markdown,
    create_text_only_markdown,
    send_to_api_and_get_haste_link
)

# Public API of the package; extended below as feature groups are imported.
__all__ = [
    'get_openrouter_client',
    'detect_file_type',
    'convert_to_pdf',
    'convert_with_libreoffice',
    'extract_slides_from_pdf',
    'extract_text_with_docling',
    'get_slide_text_content',
    'analyze_slide_with_single_prompt',
    'analyze_slides_batch',
    'analyze_slide_with_agentic_prompts_parallel',
    'process_single_slide_parallel',
    'create_slide_markdown',
    'create_text_only_markdown',
    'send_to_api_and_get_haste_link'
]

# Market Cap RAG Validation
from .rag_agent import MarketCapRAGAgent, MarketCapClaim, ValidationResult
from .validation_report import ValidationReportGenerator
from .market_cap_validator import (
    MarketCapValidator,
    validate_market_caps,
    validate_market_caps_from_file,
    validate_market_caps_from_processed
)

# Update __all__ list
__all__.extend([
    'MarketCapRAGAgent',
    'MarketCapClaim',
    'ValidationResult',
    'ValidationReportGenerator',
    'MarketCapValidator',
    'validate_market_caps',
    'validate_market_caps_from_file',
    'validate_market_caps_from_processed'
])

# Document-specific validation
from .document_validator import (
    DocumentValidator,
    validate_document_claims,
    validate_all_processed_documents
)

# Update __all__ list
__all__.extend([
    'DocumentValidator',
    'validate_document_claims',
    'validate_all_processed_documents'
])

# Main application and CLI tools
# NOTE(review): wildcard imports run each script's module-level code at
# package import time; verify these modules are safe to import as a side
# effect (app.py guards execution behind __main__, the others unchecked).
from .app import *
from .example_usage import *
from .validate_market_caps import *

# Update __all__ list
# NOTE(review): 'app', 'example_usage', 'validate_market_caps' name the
# modules themselves, but `from .x import *` binds the modules' names, not
# the module objects — these __all__ entries likely never resolve; confirm.
__all__.extend([
    'app',
    'example_usage',
    'validate_market_caps'
])
|
||||||
|
|
@ -0,0 +1,90 @@
|
||||||
|
import re
|
||||||
|
from client import get_openrouter_client
|
||||||
|
|
||||||
|
def analyze_slides_batch(client, slides_data, batch_size=1):
    """Analyze each slide with a fixed panel of specialized AI agents.

    Args:
        client: OpenAI-compatible client (e.g. from get_openrouter_client()).
        slides_data: list of dicts, each with 'page_num' (int) and 'base64'
            (base64-encoded PNG of the rendered slide).
        batch_size: kept for backward compatibility; slides are always
            processed one at a time.

    Returns:
        Dict mapping page number -> {agent_key: {'agent': display name,
        'analysis': text}}.  API failures are recorded in-band as error
        strings in 'analysis' rather than raised, so one bad call never
        aborts the whole batch.
    """
    print(f" Processing {len(slides_data)} slides individually...")

    # The agent panel is identical for every slide — build it once.
    # (Previously this dict was re-created inside the per-slide loop.)
    agents = {
        'content_extractor': {
            'name': 'Content Extractor',
            'prompt': 'Extract and summarize the key textual content from this slide. Focus on headlines, bullet points, and main messages.'
        },
        'visual_analyzer': {
            'name': 'Visual Analyzer',
            'prompt': 'Analyze the visual design elements of this slide. Comment on layout, colors, typography, and visual hierarchy.'
        },
        'data_interpreter': {
            'name': 'Data Interpreter',
            'prompt': 'Identify and interpret any numerical data, charts, graphs, or metrics present on this slide.'
        },
        'message_evaluator': {
            'name': 'Message Evaluator',
            'prompt': 'Evaluate the effectiveness of the message delivery and communication strategy on this slide.'
        },
        'improvement_suggestor': {
            'name': 'Improvement Suggestor',
            'prompt': 'Suggest specific improvements for this slide in terms of clarity, impact, and effectiveness.'
        }
    }

    all_results = {}

    for i, slide_data in enumerate(slides_data):
        slide_num = slide_data["page_num"]
        print(f" 🔍 Analyzing slide {slide_num} ({i+1}/{len(slides_data)})...")

        slide_analysis = {}

        # Run each specialized agent against the slide image.
        for j, (agent_key, agent_config) in enumerate(agents.items()):
            # Progress counter uses the real panel size (was hard-coded "/5").
            print(f" 🤖 Running {agent_config['name']} ({j+1}/{len(agents)})...")

            messages = [
                {
                    "role": "system",
                    "content": f"You are a {agent_config['name']} specialized in analyzing pitch deck slides. {agent_config['prompt']}"
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"Analyze slide {slide_num}:"},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{slide_data['base64']}"
                            }
                        }
                    ]
                }
            ]

            try:
                print(f" 📡 Sending API request...")
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages,
                    max_tokens=500
                )

                analysis = response.choices[0].message.content.strip()
                print(f" ✅ {agent_config['name']} completed ({len(analysis)} chars)")

                slide_analysis[agent_key] = {
                    'agent': agent_config['name'],
                    'analysis': analysis
                }

            except Exception as e:
                # Record the failure in-band so the batch keeps going.
                print(f" ❌ {agent_config['name']} failed: {str(e)}")
                slide_analysis[agent_key] = {
                    'agent': agent_config['name'],
                    'analysis': f"Error analyzing slide {slide_num}: {str(e)}"
                }

        all_results[slide_num] = slide_analysis
        print(f" ✅ Slide {slide_num} analysis complete")

    print(f" 🎉 All {len(slides_data)} slides analyzed successfully!")
    return all_results
|
||||||
|
|
@ -0,0 +1,23 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from openai import OpenAI
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
|
||||||
|
def get_openrouter_client():
    """Create an OpenAI SDK client configured for the OpenRouter endpoint.

    Loads ``.env`` into the environment, then reads OPENROUTER_API_KEY.
    Exits the process with status 1 when the key is missing, empty, or
    still set to the template placeholder.
    """
    load_dotenv()  # make .env values visible via os.getenv

    key = os.getenv('OPENROUTER_API_KEY')
    placeholder = 'your_openrouter_api_key_here'
    if not key or key == placeholder:
        print("❌ Error: OPENROUTER_API_KEY not properly set in .env file")
        print("Please update your .env file with a valid OpenRouter API key")
        sys.exit(1)

    return OpenAI(base_url="https://openrouter.ai/api/v1", api_key=key)
|
||||||
|
|
@ -0,0 +1,172 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
from pathlib import Path
|
||||||
|
import fitz # PyMuPDF as fallback
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(text):
    """Normalize extracted text to flat plaintext.

    Strips LaTeX commands and math, drops characters outside a punctuation
    whitelist, and collapses every run of whitespace (including newlines)
    to a single space.

    Args:
        text: raw extracted text; may be None or empty.

    Returns:
        Cleaned single-line string ('' for falsy input).
    """
    if not text:
        return ""

    # Remove LaTeX commands and math expressions
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # Remove \command{content}
    text = re.sub(r'\$[^$]*\$', '', text)  # Remove $math$ expressions
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # Remove remaining \commands

    # Replace anything outside the allowed character whitelist with a space
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' ', text)

    # Collapse all whitespace runs to single spaces.  This also removes
    # every newline, so the original follow-up r'\n\s*\n' substitution was
    # dead code and has been removed (behavior unchanged).
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_with_docling(pdf_path, output_dir, document_name):
    """Extract text content from PDF using Docling with PyMuPDF fallback.

    Saves the cleaned text to processed/<document_name>/<document_name>_text_content.md
    and returns a dict with 'text_content', 'text_file', and 'processed_dir',
    or None when both Docling and the PyMuPDF fallback fail.

    NOTE(review): the output_dir parameter is accepted but never used — the
    output location is hard-coded to 'processed/'; confirm intent.
    """
    print(f"Extracting text content with Docling: {pdf_path}")

    try:
        # Initialize Docling converter
        converter = DocumentConverter()
        # Configure OCR for better text extraction
        # NOTE(review): assumes DocumentConverter exposes a mutable
        # `ocr_options` attribute — confirm against the installed docling
        # version (newer APIs configure OCR via pipeline options instead).
        converter.ocr_options.engine = "rapidocr"  # Use faster OCR engine
        converter.ocr_options.do_ocr = True
        converter.ocr_options.do_table_ocr = True

        # Convert PDF to text
        result = converter.convert(pdf_path)

        # Get the text content (markdown export, flattened below)
        text_content = result.document.export_to_markdown()

        # Clean the text to ensure it's plaintext
        text_content = clean_text(text_content)

        # Create processed directory structure if it doesn't exist
        processed_dir = Path("processed") / document_name
        processed_dir.mkdir(parents=True, exist_ok=True)

        # Save the text content to a file
        text_file = processed_dir / f"{document_name}_text_content.md"
        with open(text_file, 'w', encoding='utf-8') as f:
            f.write(text_content)

        print(f"✅ Text content extracted and saved to: {text_file}")

        return {
            'text_content': text_content,
            'text_file': text_file,
            'processed_dir': processed_dir
        }

    except Exception as e:
        print(f"❌ Docling failed: {e}")
        print("🔄 Trying PyMuPDF fallback...")

        # Fallback to PyMuPDF: same clean/save/return shape as above.
        try:
            text_content = extract_text_with_pymupdf(pdf_path)

            if text_content:
                # Clean the text to ensure it's plaintext
                text_content = clean_text(text_content)

                # Create processed directory structure if it doesn't exist
                processed_dir = Path("processed") / document_name
                processed_dir.mkdir(parents=True, exist_ok=True)

                # Save the text content to a file
                text_file = processed_dir / f"{document_name}_text_content.md"
                with open(text_file, 'w', encoding='utf-8') as f:
                    f.write(text_content)

                print(f"✅ Text content extracted with PyMuPDF fallback: {text_file}")

                return {
                    'text_content': text_content,
                    'text_file': text_file,
                    'processed_dir': processed_dir
                }
            else:
                print("⚠️ PyMuPDF fallback also failed")
                return None

        except Exception as fallback_error:
            print(f"❌ PyMuPDF fallback also failed: {fallback_error}")
            return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_with_pymupdf(pdf_path):
    """Fallback plaintext extraction via PyMuPDF.

    Produces one "--- Page N ---" section per page, each cleaned through
    clean_text().  Returns the concatenated string, or None on any error.
    """
    try:
        doc = fitz.open(pdf_path)
        sections = []

        for idx in range(len(doc)):
            # Clean each page's raw text before appending its section.
            body = clean_text(doc[idx].get_text())
            sections.append(f"\n--- Page {idx + 1} ---\n{body}\n")

        doc.close()
        return "".join(sections)

    except Exception as err:
        print(f"PyMuPDF extraction failed: {err}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_slide_text_content(text_content, slide_num):
    """Extract the text for one slide from the full document text.

    Looks for the '--- Page N ---' separators written by
    extract_text_with_pymupdf; when no separator matches, falls back to
    paragraph slicing and finally to an approximate line window.

    Args:
        text_content: full document text (may be empty/None).
        slide_num: 1-based slide/page number.

    Returns:
        Cleaned slide text, '' for empty input, or a placeholder string
        when extraction raises.
    """
    try:
        if not text_content:
            return ""

        # Split by page separators ('\n--- Page N ---\n')
        pages = text_content.split('--- Page')

        # Find the page for this slide.
        # BUG FIX: the original tested page.strip().startswith(f" {n} ---"),
        # but strip() removes the leading space, so the prefix could never
        # match and every lookup silently fell through to the fallbacks.
        target_page = None
        for page in pages:
            if page.strip().startswith(f"{slide_num} ---"):
                target_page = page
                break

        if target_page:
            # Remove the page header line and clean up
            lines = target_page.split('\n')[1:]  # Remove page header
            slide_text = '\n'.join(lines).strip()

            # Further clean the slide text
            slide_text = clean_text(slide_text)

            return slide_text
        else:
            # Fallback: try to extract from paragraph sections
            sections = text_content.split('\n\n')
            if slide_num <= len(sections):
                return clean_text(sections[slide_num - 1] if slide_num > 0 else sections[0])
            else:
                # Last resort: return an approximate window of lines
                lines = text_content.split('\n')
                start_line = (slide_num - 1) * 5  # Approximate 5 lines per slide
                end_line = min(start_line + 10, len(lines))  # Up to 10 lines
                slide_text = '\n'.join(lines[start_line:end_line])
                return clean_text(slide_text)

    except Exception as e:
        print(f"⚠️ Error extracting text for slide {slide_num}: {e}")
        return f"[Text content for slide {slide_num} could not be extracted]"
|
||||||
|
|
@ -0,0 +1,199 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
Document-specific validator that organizes reports by document in processed directory
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from .rag_agent import MarketCapRAGAgent
|
||||||
|
from .validation_report import ValidationReportGenerator
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentValidator:
    """
    Validates financial claims for specific documents with proper directory organization.

    Wraps a MarketCapRAGAgent (claim extraction + validation) and a
    ValidationReportGenerator (markdown report rendering), saving reports
    under processed/<document_name>/.
    """

    def __init__(self, api_key: Optional[str] = None):
        # api_key is forwarded to the RAG agent; None means "use its default".
        self.rag_agent = MarketCapRAGAgent(api_key)
        self.report_generator = ValidationReportGenerator()

    def validate_document(self, document_name: str, slide_texts: List[Dict[str, Any]],
                          save_report: bool = True) -> Dict[str, Any]:
        """
        Validate financial claims for a specific document

        Args:
            document_name: Name of the document (e.g., "Uber-Pitch-Deck")
            slide_texts: List of slide data with 'slide_number' and 'text' keys
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary with keys 'document_name', 'validation_results',
            'report', 'report_filename' (None when save_report is False),
            and 'summary' (see _generate_summary).
        """
        print(f"🔍 Validating financial claims for: {document_name}")

        # Extract and validate claims
        validation_results = self.rag_agent.validate_all_claims(slide_texts)

        # Generate report
        report = self.report_generator.generate_report(validation_results, slide_texts)

        # Save report in proper directory structure
        report_filename = None
        if save_report:
            # Create document-specific directory
            doc_dir = os.path.join("processed", document_name)
            os.makedirs(doc_dir, exist_ok=True)

            # Save report in document directory
            report_filename = self.report_generator.save_report(
                report,
                f"{document_name}_market_cap_validation.md",
                doc_dir
            )
            print(f"📄 Validation report saved to: {report_filename}")

        # Prepare summary
        summary = self._generate_summary(validation_results)

        return {
            'document_name': document_name,
            'validation_results': validation_results,
            'report': report,
            'report_filename': report_filename,
            'summary': summary
        }

    def validate_from_processed_folder(self, folder_path: str = "processed") -> Dict[str, Any]:
        """
        Validate all documents in the processed folder

        Args:
            folder_path: Path to processed folder

        Returns:
            Dictionary with results for each document; a failing document
            maps to {'error': message} instead of aborting the batch.

        Raises:
            ValueError: when folder_path does not exist.
        """
        results = {}

        if not os.path.exists(folder_path):
            raise ValueError(f"Processed folder not found: {folder_path}")

        # Find all document directories (skip hidden entries)
        for item in os.listdir(folder_path):
            item_path = os.path.join(folder_path, item)
            if os.path.isdir(item_path) and not item.startswith('.'):
                # Look for text content files written by the extraction step
                text_files = [f for f in os.listdir(item_path) if f.endswith('_text_content.md')]

                if text_files:
                    document_name = item
                    # Only the first matching text file is used
                    text_file = os.path.join(item_path, text_files[0])

                    print(f"📁 Processing document: {document_name}")

                    # Read text content
                    with open(text_file, 'r', encoding='utf-8') as f:
                        content = f.read()

                    # Convert to slide format
                    # NOTE(review): the whole document is collapsed into a
                    # single slide entry here, so per-slide claim numbers in
                    # the report will all be 1 — confirm this is intended.
                    slide_texts = [{
                        "slide_number": 1,
                        "text": content
                    }]

                    # Validate document; record failures in-band
                    try:
                        doc_results = self.validate_document(document_name, slide_texts)
                        results[document_name] = doc_results
                    except Exception as e:
                        print(f"❌ Error processing {document_name}: {e}")
                        results[document_name] = {'error': str(e)}

        return results

    def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
        """Generate a summary of validation results.

        Returns counts of accurate/inaccurate claims, an accuracy_rate
        percentage (0 when there are no claims), and a per-slide grouping.
        """
        total_claims = len(validation_results)
        accurate_claims = sum(1 for r in validation_results if r.is_accurate)
        inaccurate_claims = total_claims - accurate_claims

        return {
            'total_claims': total_claims,
            'accurate_claims': accurate_claims,
            'inaccurate_claims': inaccurate_claims,
            'accuracy_rate': (accurate_claims / total_claims * 100) if total_claims > 0 else 0,
            'claims_by_slide': self._group_claims_by_slide(validation_results)
        }

    def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
        """Group validation results by their claim's slide number."""
        claims_by_slide = {}
        for result in validation_results:
            slide_num = result.claim.slide_number
            if slide_num not in claims_by_slide:
                claims_by_slide[slide_num] = []
            claims_by_slide[slide_num].append(result)
        return claims_by_slide
|
||||||
|
|
||||||
|
|
||||||
|
def validate_document_claims(document_name: str, slide_texts: List[Dict[str, Any]],
                             api_key: Optional[str] = None,
                             save_report: bool = True) -> Dict[str, Any]:
    """Module-level shortcut around DocumentValidator.validate_document.

    Args:
        document_name: Name of the document.
        slide_texts: List of slide data ('slide_number' / 'text' dicts).
        api_key: OpenRouter API key (optional).
        save_report: Whether to write the validation report to disk.

    Returns:
        Dictionary containing validation results and report.
    """
    return DocumentValidator(api_key).validate_document(
        document_name, slide_texts, save_report
    )
|
||||||
|
|
||||||
|
|
||||||
|
def validate_all_processed_documents(folder_path: str = "processed",
                                     api_key: Optional[str] = None) -> Dict[str, Any]:
    """Module-level shortcut that validates every document under *folder_path*.

    Args:
        folder_path: Path to the processed folder.
        api_key: OpenRouter API key (optional).

    Returns:
        Dictionary with results for each document.
    """
    return DocumentValidator(api_key).validate_from_processed_folder(folder_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Example usage: validate every document under processed/ and print a
    # one-line status per document.
    print("Document Validator - RAG Agent")
    print("===============================")

    try:
        results = validate_all_processed_documents()

        print(f"\n✅ Validation Complete!")
        print(f"📊 Processed {len(results)} documents:")

        for doc_name, doc_results in results.items():
            # Per-document failures are reported in-band under 'error'.
            if 'error' in doc_results:
                print(f" ❌ {doc_name}: {doc_results['error']}")
            else:
                summary = doc_results['summary']
                print(f" ✅ {doc_name}: {summary['total_claims']} claims, {summary['accuracy_rate']:.1f}% accurate")
                if doc_results['report_filename']:
                    print(f" 📄 Report: {doc_results['report_filename']}")

    except Exception as e:
        # Catch-all boundary for the CLI demo; keeps the script from
        # dying with a traceback.
        print(f"❌ Error: {e}")
|
||||||
|
|
@ -0,0 +1,111 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def detect_file_type(file_path):
    """Classify a file by its extension.

    Returns one of 'pdf', 'powerpoint', 'word', 'openoffice_presentation',
    'openoffice_document', or 'unknown' for anything unrecognized.
    Matching is case-insensitive.
    """
    extension_map = {
        '.pdf': 'pdf',
        '.pptx': 'powerpoint',
        '.ppt': 'powerpoint',
        '.docx': 'word',
        '.doc': 'word',
        '.odp': 'openoffice_presentation',
        '.odt': 'openoffice_document',
    }
    return extension_map.get(Path(file_path).suffix.lower(), 'unknown')
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_pdf(input_file, output_dir, document_name):
    """Convert PowerPoint/Word/OpenOffice files to PDF (no-op for PDFs).

    Args:
        input_file: source document path.
        output_dir: directory where the temporary PDF is written.
        document_name: base name used for the temporary PDF.

    Returns:
        Path (str) to the resulting PDF, or None on failure.
    """
    file_type = detect_file_type(input_file)

    if file_type == 'pdf':
        print("✅ File is already PDF, no conversion needed")
        return input_file

    print(f"🔄 Converting {file_type} file to PDF...")

    # BUG FIX: temp_pdf was built with string concatenation but later used
    # as a Path (temp_pdf.exists() raised AttributeError on str); build a
    # real Path so .exists() and downstream .parent work.
    temp_pdf = Path(output_dir) / f"{document_name}_temp.pdf"

    try:
        if file_type == 'powerpoint':
            # Convert PowerPoint to PDF using pptxtopdf
            print(" Using pptxtopdf for PowerPoint conversion...")
            result = subprocess.run([
                'python', '-c',
                f'import pptxtopdf; pptxtopdf.convert("{input_file}", "{temp_pdf}")'
            ], capture_output=True, text=True, timeout=60)

            if result.returncode != 0:
                print(f"⚠️ pptxtopdf failed: {result.stderr}")
                # Fallback: try using LibreOffice
                return convert_with_libreoffice(input_file, temp_pdf, file_type)

        elif file_type in ['word', 'openoffice_document', 'openoffice_presentation']:
            # LibreOffice handles all document and presentation formats
            # (the two former elif branches were identical — merged).
            return convert_with_libreoffice(input_file, temp_pdf, file_type)

        else:
            print(f"❌ Unsupported file type: {file_type}")
            return None

        if temp_pdf.exists():
            print(f"✅ Successfully converted to PDF: {temp_pdf}")
            return str(temp_pdf)
        else:
            print("❌ Conversion failed - PDF file not created")
            return None

    except subprocess.TimeoutExpired:
        print("❌ Conversion timed out")
        return None
    except Exception as e:
        print(f"❌ Conversion error: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def convert_with_libreoffice(input_file, output_pdf, file_type):
    """Convert a document to PDF using LibreOffice as a fallback converter.

    Args:
        input_file: Path to the source document (str or Path).
        output_pdf: Desired path for the resulting PDF (str or Path).
            Callers pass plain strings, so it is normalized to Path here.
        file_type: Human-readable file-type label, used only for logging.

    Returns:
        str path to the converted PDF on success, or None on failure.
    """
    try:
        print(f" Using LibreOffice for {file_type} conversion...")

        # Bug fix: callers pass output_pdf as a plain string, but the original
        # code called .parent/.exists()/.rename() on it (AttributeError).
        output_pdf = Path(output_pdf)

        # LibreOffice command; it writes the PDF into --outdir using the
        # input file's base name, not the name we want.
        cmd = [
            'soffice', '--headless', '--convert-to', 'pdf',
            '--outdir', str(output_pdf.parent),
            str(input_file)
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

        if result.returncode == 0:
            # LibreOffice creates a PDF with the same stem as the input.
            libreoffice_pdf = output_pdf.parent / f"{Path(input_file).stem}.pdf"

            if libreoffice_pdf.exists():
                # Rename to the temp name the caller expects.
                libreoffice_pdf.rename(output_pdf)
                print(f"✅ LibreOffice conversion successful: {output_pdf}")
                return str(output_pdf)

        print(f"⚠️ LibreOffice conversion failed: {result.stderr}")
        return None

    except subprocess.TimeoutExpired:
        print("❌ LibreOffice conversion timed out")
        return None
    except Exception as e:
        print(f"❌ LibreOffice conversion error: {e}")
        return None
|
||||||
|
|
@ -0,0 +1,173 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def clean_markdown_text(text):
    """Clean markdown text to ensure it's plaintext with no special characters.

    Strips LaTeX commands, inline math, markdown emphasis/code/header markup,
    and disallowed symbols, then normalizes whitespace while preserving
    paragraph breaks.

    Args:
        text: Markdown/LaTeX-flavored string (may be empty or None).

    Returns:
        Cleaned plaintext string ("" for falsy input).
    """
    if not text:
        return ""

    # Remove LaTeX commands and math expressions
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # Remove \command{content}
    text = re.sub(r'\$[^$]*\$', '', text)             # Remove $math$ expressions
    text = re.sub(r'\\[a-zA-Z]+', '', text)           # Remove remaining \commands

    # Remove markdown formatting but keep the text
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)    # Remove bold **text**
    text = re.sub(r'\*([^*]+)\*', r'\1', text)        # Remove italic *text*
    text = re.sub(r'`([^`]+)`', r'\1', text)          # Remove code `text`
    text = re.sub(r'#{1,6}\s*', '', text)             # Remove headers # ## ###

    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' ', text)

    # Bug fix: the original collapsed ALL whitespace (including newlines)
    # to single spaces first, which made the subsequent blank-line
    # normalization dead code and destroyed paragraph breaks. Normalize
    # blank-line runs first, then collapse only horizontal whitespace.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r'[^\S\n]+', ' ', text)

    return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def create_slide_markdown(slide_data, analysis_results, slide_num, slide_text=""):
    """Create markdown content for a single slide with all agentic analyses and text content"""
    # Assemble the document from ordered fragments, then join once at the end.
    parts = [
        f"# Slide {slide_num}\n\n![Slide {slide_num}](slides/{slide_data['filename']})\n\n"
    ]

    # A "Text Content" section is emitted only when the slide has non-blank text.
    if slide_text and slide_text.strip():
        # Reduce the slide text to plaintext before embedding it.
        parts.append(f"## Text Content\n\n{clean_markdown_text(slide_text)}\n\n")

    parts.append("## Agentic Analysis\n\n")

    # One "### <agent>" subsection per analysis result, text cleaned to plaintext.
    for result in analysis_results.values():
        parts.append(f"### {result['agent']}\n\n{clean_markdown_text(result['analysis'])}\n\n")

    # Horizontal rule separates this slide from the next one.
    parts.append("---\n\n")
    return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def create_text_only_markdown(markdown_content):
    """Create a text-only version of markdown without image references for API submission"""
    # Ordered (pattern, flags) pairs whose matches are deleted outright.
    removals = (
        (r'!\[.*?\]\(slides/.*?\)\n', 0),                            # image embeds
        (r'\*\[View full size: slides/.*?\]\(slides/.*?\)\*\n', 0),  # image links
        (r'^---\n', re.MULTILINE),                                   # slide separators
    )

    stripped = markdown_content
    for pattern, flags in removals:
        stripped = re.sub(pattern, '', stripped, flags=flags)

    # Collapse runs of three or more newlines left behind by the removals.
    stripped = re.sub(r'\n{3,}', '\n\n', stripped)

    # Final pass reduces everything to plaintext for the API.
    return clean_markdown_text(stripped).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def send_to_api_and_get_haste_link(markdown_content, document_title):
    """Send markdown to API and get both raw markdown and HTML URLs.

    Uploads a text-only version of the markdown to haste.nixc.us (raw paste)
    and to md.colinknapp.com (rendered HTML). Each upload is best-effort.

    Args:
        markdown_content: Full markdown document (image references included).
        document_title: Title used for the rendered HTML page.

    Returns:
        (raw_haste_url, html_url) tuple; either element may be None if its
        upload fails, and (None, None) is returned on total failure.
    """
    try:
        print("Sending to API for URLs...")

        # Images cannot be uploaded as part of the paste, so strip them first.
        text_only_markdown = create_text_only_markdown(markdown_content)

        # First, send raw markdown to haste.nixc.us
        raw_haste_url = None
        try:
            print(" 📝 Creating raw markdown URL...")
            raw_response = requests.post(
                "https://haste.nixc.us/documents",
                data=text_only_markdown.encode('utf-8'),
                headers={"Content-Type": "text/plain"},
                timeout=30
            )

            if raw_response.status_code == 200:
                raw_token = raw_response.text.strip().strip('"')
                # Some hastebin deployments answer with JSON ({"key": "..."})
                # instead of a bare token; unwrap it when that shape appears.
                if raw_token.startswith('{"key":"') and raw_token.endswith('"}'):
                    # Fix: narrow the bare `except:` (which hid real errors)
                    # and drop the redundant inner `import json` — the module
                    # already imports json at the top.
                    try:
                        raw_token = json.loads(raw_token)['key']
                    except (json.JSONDecodeError, KeyError):
                        pass  # best-effort: fall back to the raw body text
                raw_haste_url = f"https://haste.nixc.us/{raw_token}"
                print(f" ✅ Raw markdown URL created")
            else:
                print(f" ⚠️ Raw markdown upload failed with status {raw_response.status_code}")
        except Exception as e:
            print(f" ⚠️ Failed to create raw markdown URL: {e}")

        # Then, send to md.colinknapp.com for HTML version
        html_url = None
        try:
            print(" 🎨 Creating HTML version URL...")
            api_data = {
                "markdown": text_only_markdown,
                "format": "html",
                "template": "playful",
                "title": f"Pitch Deck Analysis: {document_title}",
                "subtitle": "AI-Generated Analysis with Agentic Insights",
                "contact": "Generated by Pitch Deck Parser",
                "send_to_haste": True
            }

            response = requests.post(
                "https://md.colinknapp.com/api/convert",
                headers={"Content-Type": "application/json"},
                data=json.dumps(api_data),
                timeout=30
            )

            if response.status_code == 200:
                result = response.json()
                if 'haste_url' in result:
                    # Rewrite haste tokens to the HTML viewer on md.colinknapp.com.
                    haste_url = result['haste_url']
                    if 'haste.nixc.us/' in haste_url:
                        token = haste_url.split('haste.nixc.us/')[-1]
                        html_url = f"https://md.colinknapp.com/haste/{token}"
                    else:
                        html_url = haste_url
                    print(f" ✅ HTML version URL created")
                else:
                    print(" ⚠️ API response missing haste_url")
            else:
                print(f" ⚠️ HTML API request failed with status {response.status_code}")
        except Exception as e:
            print(f" ⚠️ Failed to create HTML URL: {e}")

        return raw_haste_url, html_url

    except Exception as e:
        print(f"⚠️ Failed to send to API: {e}")
        return None, None
|
||||||
|
|
@ -0,0 +1,235 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
Market Cap Validator - Main Interface
|
||||||
|
|
||||||
|
This module provides a simple interface to validate market cap claims
|
||||||
|
from pitch deck slides using RAG search capabilities.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from .rag_agent import MarketCapRAGAgent
|
||||||
|
from .validation_report import ValidationReportGenerator
|
||||||
|
|
||||||
|
|
||||||
|
class MarketCapValidator:
    """
    Main interface for market cap validation using RAG search.

    Composes a MarketCapRAGAgent (claim extraction and web-search
    validation) with a ValidationReportGenerator (report creation and
    persistence).
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the market cap validator

        Args:
            api_key: OpenRouter API key (if not provided, will use environment variable)
        """
        self.rag_agent = MarketCapRAGAgent(api_key)
        self.report_generator = ValidationReportGenerator()

    def validate_from_slides(self, slide_texts: List[Dict[str, Any]],
                             save_report: bool = True) -> Dict[str, Any]:
        """
        Validate market cap claims from slide text exports

        Args:
            slide_texts: List of slide data with 'slide_number' and 'text' keys
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary with 'validation_results', 'report', 'report_filename'
            (None when save_report is False) and 'summary' keys
        """
        print("🔍 Starting market cap validation process...")

        # Extract and validate claims via the RAG agent
        validation_results = self.rag_agent.validate_all_claims(slide_texts)

        # Generate report
        report = self.report_generator.generate_report(validation_results, slide_texts)

        # Save report if requested
        report_filename = None
        if save_report:
            report_filename = self.report_generator.save_report(report)
            print(f"📄 Validation report saved to: {report_filename}")

        # Prepare summary
        summary = self._generate_summary(validation_results)

        return {
            'validation_results': validation_results,
            'report': report,
            'report_filename': report_filename,
            'summary': summary
        }

    def validate_from_file(self, file_path: str, save_report: bool = True) -> Dict[str, Any]:
        """
        Validate market cap claims from a JSON file containing slide texts

        Args:
            file_path: Path to JSON file with slide data
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary containing validation results and report

        Raises:
            FileNotFoundError: If file_path does not exist.
            ValueError: If the file is not valid JSON.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                slide_texts = json.load(f)

            print(f"📁 Loaded slide data from: {file_path}")
            return self.validate_from_slides(slide_texts, save_report)

        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {file_path}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON file: {e}")

    def validate_from_processed_folder(self, folder_path: str = "processed",
                                       save_report: bool = True) -> Dict[str, Any]:
        """
        Validate market cap claims from processed slide files

        Args:
            folder_path: Path to folder containing processed slide files
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary containing validation results and report

        Raises:
            ValueError: If no valid slide data is found in folder_path.
        """
        slide_texts = []

        # Look for JSON files in the processed folder
        if os.path.exists(folder_path):
            for filename in os.listdir(folder_path):
                if filename.endswith('.json'):
                    file_path = os.path.join(folder_path, filename)
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            data = json.load(f)

                        # Handle different JSON structures
                        if isinstance(data, list):
                            slide_texts.extend(data)
                        elif isinstance(data, dict) and 'slides' in data:
                            slide_texts.extend(data['slides'])
                        elif isinstance(data, dict) and 'text' in data:
                            slide_texts.append(data)

                    except (json.JSONDecodeError, KeyError) as e:
                        # Fix: report which file was skipped — the original
                        # printed the literal placeholder "(unknown)".
                        print(f"⚠️ Skipping invalid file {filename}: {e}")
                        continue

        if not slide_texts:
            raise ValueError(f"No valid slide data found in {folder_path}")

        print(f"📁 Loaded {len(slide_texts)} slides from processed folder")
        return self.validate_from_slides(slide_texts, save_report)

    def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
        """Generate an aggregate summary of validation results"""
        total_claims = len(validation_results)
        accurate_claims = sum(1 for r in validation_results if r.is_accurate)
        inaccurate_claims = total_claims - accurate_claims

        return {
            'total_claims': total_claims,
            'accurate_claims': accurate_claims,
            'inaccurate_claims': inaccurate_claims,
            # Guard against division by zero when no claims were found
            'accuracy_rate': (accurate_claims / total_claims * 100) if total_claims > 0 else 0,
            'claims_by_slide': self._group_claims_by_slide(validation_results)
        }

    def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
        """Group validation results by their claim's slide number"""
        claims_by_slide = {}
        for result in validation_results:
            # setdefault replaces the manual key-existence check
            claims_by_slide.setdefault(result.claim.slide_number, []).append(result)
        return claims_by_slide
|
||||||
|
|
||||||
|
|
||||||
|
def validate_market_caps(slide_texts: List[Dict[str, Any]],
                         api_key: Optional[str] = None,
                         save_report: bool = True) -> Dict[str, Any]:
    """
    Convenience function to validate market cap claims.

    Args:
        slide_texts: List of slide data with 'slide_number' and 'text' keys
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Dictionary containing validation results and report
    """
    # Thin wrapper: construct a validator and delegate in a single expression.
    return MarketCapValidator(api_key).validate_from_slides(slide_texts, save_report)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_market_caps_from_file(file_path: str,
                                   api_key: Optional[str] = None,
                                   save_report: bool = True) -> Dict[str, Any]:
    """
    Convenience function to validate market cap claims from a file.

    Args:
        file_path: Path to JSON file with slide data
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Dictionary containing validation results and report
    """
    # Thin wrapper: construct a validator and delegate in a single expression.
    return MarketCapValidator(api_key).validate_from_file(file_path, save_report)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_market_caps_from_processed(folder_path: str = "processed",
                                        api_key: Optional[str] = None,
                                        save_report: bool = True) -> Dict[str, Any]:
    """
    Convenience function to validate market cap claims from processed folder.

    Args:
        folder_path: Path to folder containing processed slide files
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Dictionary containing validation results and report
    """
    # Thin wrapper: construct a validator and delegate in a single expression.
    return MarketCapValidator(api_key).validate_from_processed_folder(folder_path, save_report)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Example usage: running this module directly validates every document
    # found in the processed/ folder and prints a summary.
    print("Market Cap Validator - RAG Agent")
    print("=================================")

    # Try to validate from processed folder
    try:
        results = validate_market_caps_from_processed()

        print(f"\n✅ Validation Complete!")
        print(f"📊 Summary:")
        print(f" - Total Claims: {results['summary']['total_claims']}")
        print(f" - Accurate: {results['summary']['accurate_claims']}")
        print(f" - Inaccurate: {results['summary']['inaccurate_claims']}")
        print(f" - Accuracy Rate: {results['summary']['accuracy_rate']:.1f}%")

        # report_filename is only set when saving was left enabled
        if results['report_filename']:
            print(f"📄 Report saved to: {results['report_filename']}")

    except Exception as e:
        # Surface the failure, then show how the tool is meant to be invoked
        print(f"❌ Error: {e}")
        print("\nUsage examples:")
        print("1. Place slide data JSON files in 'processed/' folder")
        print("2. Run: python -m modules.market_cap_validator")
        print("3. Or use the functions directly in your code")
|
||||||
|
|
@ -0,0 +1,60 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import fitz # PyMuPDF for PDF processing
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def extract_slides_from_pdf(pdf_path, output_dir, document_name):
    """Extract individual slides from PDF as images.

    Renders each page as a high-resolution PNG under
    ./processed/<document_name>/slides/ and also base64-encodes it for
    API submission.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Unused; kept for backward compatibility with callers —
            output always goes under ./processed/<document_name>/.
        document_name: Name used for the output folder and file prefixes.

    Returns:
        List of per-page dicts with keys 'page_num', 'filename', 'path',
        'base64', 'document_name', 'processed_dir'; empty list on failure.
    """
    print(f"Extracting slides from PDF: {pdf_path}")

    # Create processed directory structure: ./processed/DocumentName/slides/
    # (parents=True creates both levels in one call)
    processed_dir = Path("processed") / document_name
    slides_dir = processed_dir / "slides"
    slides_dir.mkdir(parents=True, exist_ok=True)

    slides = []
    pdf_document = None

    try:
        # Open PDF with PyMuPDF
        pdf_document = fitz.open(pdf_path)

        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]

            # Convert page to image (2x zoom for better quality)
            mat = fitz.Matrix(2.0, 2.0)
            pix = page.get_pixmap(matrix=mat)

            # Save as PNG with document name prefix
            slide_filename = f"{document_name}_slide_{page_num + 1:03d}.png"
            slide_path = slides_dir / slide_filename
            pix.save(str(slide_path))

            # Convert to base64 for API
            img_data = pix.tobytes("png")
            img_base64 = base64.b64encode(img_data).decode('utf-8')

            slides.append({
                'page_num': page_num + 1,
                'filename': slide_filename,
                'path': slide_path,
                'base64': img_base64,
                'document_name': document_name,
                'processed_dir': processed_dir
            })

            print(f" Extracted slide {page_num + 1}")

        print(f"✅ Extracted {len(slides)} slides")
        return slides

    except Exception as e:
        print(f"❌ Error extracting slides: {e}")
        return []
    finally:
        # Fix: the original only closed the document on the success path,
        # leaking the handle when a page failed mid-loop.
        if pdf_document is not None:
            pdf_document.close()
|
||||||
|
|
@ -0,0 +1,286 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from .client import get_openrouter_client
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MarketCapClaim:
    """Represents a market cap claim found in slide text"""
    # Slide the claim was found on (0 when the export had no slide number)
    slide_number: int
    # Company the claim refers to, extracted from the slide's first lines
    company_name: str
    # The claimed value as written in the slide, e.g. "2.5B" — not normalized
    claimed_market_cap: str
    # The exact substring of slide text that matched a market-cap pattern
    raw_text: str
    # Heuristic score (capped at 1.0) based on the words around the match
    confidence: float
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ValidationResult:
    """Represents the validation result for a market cap claim"""
    # The original claim being validated
    claim: MarketCapClaim
    # Market cap value found by the RAG search, or None if nothing was found
    validated_market_cap: Optional[str]
    # Where the validated figure came from ("RAG Search" when unattributed)
    validation_source: str
    # Confidence in the validation outcome (0.0 when the search failed)
    confidence_score: float
    # True when claimed and validated values agree within ~20%
    is_accurate: bool
    # Human-readable description of a mismatch, or None when accurate
    discrepancy: Optional[str]
    # The query string sent to the web search
    rag_search_query: str
    # The model's full textual response (or an error message)
    rag_response: str
|
||||||
|
|
||||||
|
|
||||||
|
class MarketCapRAGAgent:
    """
    RAG Agent for validating market cap claims from pitch deck slides
    using OpenRouter's web search capabilities.
    """

    def __init__(self, api_key: Optional[str] = None):
        # NOTE(review): api_key is accepted but not forwarded —
        # get_openrouter_client() is called with no arguments; confirm
        # whether the client reads the key from the environment or the
        # parameter should be passed through.
        self.client = get_openrouter_client()
        # Each pattern captures a numeric value with an optional suffix.
        # Generalized [BMK] -> [BMKT]: _normalize_value already understands
        # trillion-scale values, so trillion claims are now matched too.
        self.market_cap_patterns = [
            r'market\s+cap(?:italization)?\s*:?\s*\$?([0-9,.]+[BMKT]?)',
            r'valuation\s*:?\s*\$?([0-9,.]+[BMKT]?)',
            r'worth\s*:?\s*\$?([0-9,.]+[BMKT]?)',
            r'valued\s+at\s*:?\s*\$?([0-9,.]+[BMKT]?)',
            r'\$([0-9,.]+[BMKT]?)\s+(?:market\s+cap|valuation)',
            r'(?:market\s+cap|valuation)\s+of\s+\$?([0-9,.]+[BMKT]?)'
        ]

    def extract_market_cap_claims(self, slide_texts: List[Dict[str, Any]]) -> List["MarketCapClaim"]:
        """
        Extract market cap claims from slide text exports

        Args:
            slide_texts: List of slide data with 'slide_number' and 'text' keys

        Returns:
            List of MarketCapClaim objects
        """
        claims = []

        for slide_data in slide_texts:
            slide_number = slide_data.get('slide_number', 0)
            text = slide_data.get('text', '')

            if not text:
                continue

            # Company name is usually in the first few lines or the title
            company_name = self._extract_company_name(text)

            # Search for market cap patterns
            for pattern in self.market_cap_patterns:
                for match in re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE):
                    claims.append(MarketCapClaim(
                        slide_number=slide_number,
                        company_name=company_name,
                        claimed_market_cap=match.group(1),
                        raw_text=match.group(0),
                        # Confidence depends on the words around the match
                        confidence=self._calculate_confidence(text, match.start(), match.end())
                    ))

        return claims

    def _extract_company_name(self, text: str) -> str:
        """Extract company name from slide text (best-effort heuristic)."""
        lines = text.split('\n')[:5]  # Check first 5 lines

        for line in lines:
            line = line.strip()
            # A plausible name is short but not trivially short
            if line and len(line) > 2 and len(line) < 100:
                # Skip common slide headers
                if not any(header in line.lower() for header in ['slide', 'page', 'agenda', 'overview']):
                    return line

        return "Unknown Company"

    def _calculate_confidence(self, text: str, start: int, end: int) -> float:
        """Calculate confidence score for a market cap claim (0.5 base, capped at 1.0)."""
        confidence = 0.5  # Base confidence

        # Extract ~50 chars of context on either side of the match
        context_start = max(0, start - 50)
        context_end = min(len(text), end + 50)
        context = text[context_start:context_end].lower()

        # Recency indicators
        if any(indicator in context for indicator in ['current', 'latest', 'as of', '2024', '2025']):
            confidence += 0.2

        # Explicit magnitude words
        if any(indicator in context for indicator in ['billion', 'million', 'trillion']):
            confidence += 0.1

        # The claim is explicitly labelled
        if 'market cap' in context or 'valuation' in context:
            confidence += 0.2

        return min(confidence, 1.0)

    def validate_claim_with_rag(self, claim: "MarketCapClaim") -> "ValidationResult":
        """
        Validate a market cap claim using RAG search

        Args:
            claim: MarketCapClaim to validate

        Returns:
            ValidationResult with validation details
        """
        # Construct RAG search query
        search_query = f"{claim.company_name} current market cap valuation 2024 2025"

        try:
            # Use OpenRouter with online search enabled
            response = self.client.chat.completions.create(
                model="mistralai/mistral-small",
                messages=[
                    {
                        "role": "user",
                        "content": f"""
Please search for the current market cap or valuation of {claim.company_name}.

The company claims their market cap is ${claim.claimed_market_cap}.

Please provide:
1. The current market cap/valuation if found
2. The source of this information
3. Whether the claimed value appears accurate
4. Any significant discrepancies

Focus on recent data from 2024-2025.
"""
                    }
                ],
                max_tokens=800
            )

            rag_response = response.choices[0].message.content.strip()

            # Parse the response to extract validation details
            validation_details = self._parse_rag_response(rag_response, claim)

            return ValidationResult(
                claim=claim,
                validated_market_cap=validation_details.get('validated_cap'),
                validation_source=validation_details.get('source', 'RAG Search'),
                confidence_score=validation_details.get('confidence', 0.5),
                is_accurate=validation_details.get('is_accurate', False),
                discrepancy=validation_details.get('discrepancy'),
                rag_search_query=search_query,
                rag_response=rag_response
            )

        except Exception as e:
            # Degrade gracefully: record the failure as an inaccurate result
            return ValidationResult(
                claim=claim,
                validated_market_cap=None,
                validation_source="Error",
                confidence_score=0.0,
                is_accurate=False,
                discrepancy=f"RAG search failed: {str(e)}",
                rag_search_query=search_query,
                rag_response=f"Error: {str(e)}"
            )

    def _parse_rag_response(self, response: str, claim: "MarketCapClaim") -> Dict[str, Any]:
        """Parse RAG response to extract validation details"""
        details = {
            'validated_cap': None,
            'source': 'RAG Search',
            'confidence': 0.5,
            'is_accurate': False,
            'discrepancy': None
        }

        response_lower = response.lower()

        # Bug fix: the response is lowercased, but the original classes were
        # uppercase [BMK] without IGNORECASE, so suffixes were silently
        # dropped and normalized comparison was off by orders of magnitude.
        # Match lowercase suffixes (T added for trillion).
        cap_patterns = [
            r'\$([0-9,.]+[bmkt]?)',
            r'([0-9,.]+[bmkt]?)\s+(?:billion|million|trillion)',
            r'market\s+cap(?:italization)?\s*:?\s*\$?([0-9,.]+[bmkt]?)'
        ]

        for pattern in cap_patterns:
            matches = re.findall(pattern, response_lower)
            if matches:
                details['validated_cap'] = matches[0]
                break

        # Determine accuracy by comparing normalized magnitudes
        if details['validated_cap']:
            claimed_normalized = self._normalize_value(claim.claimed_market_cap)
            validated_normalized = self._normalize_value(details['validated_cap'])

            if claimed_normalized and validated_normalized:
                # Allow for some variance (within 20%)
                ratio = min(claimed_normalized, validated_normalized) / max(claimed_normalized, validated_normalized)
                details['is_accurate'] = ratio > 0.8

                if not details['is_accurate']:
                    details['discrepancy'] = f"Claimed: ${claim.claimed_market_cap}, Found: ${details['validated_cap']}"

        # Extract source information when the model attributes its answer
        if 'source:' in response_lower or 'according to' in response_lower:
            source_match = re.search(r'(?:source:|according to)\s*([^\n]+)', response_lower)
            if source_match:
                details['source'] = source_match.group(1).strip()

        return details

    def _normalize_value(self, value: str) -> Optional[float]:
        """Normalize market cap value (e.g. "2.5B") to a comparable float, or None."""
        if not value:
            return None

        value = value.replace(',', '').upper()

        # Map suffix to multiplier; uppercase handles lowercase inputs too
        multiplier = 1
        if value.endswith('B'):
            multiplier = 1_000_000_000
            value = value[:-1]
        elif value.endswith('M'):
            multiplier = 1_000_000
            value = value[:-1]
        elif value.endswith('K'):
            multiplier = 1_000
            value = value[:-1]
        elif value.endswith('T'):
            multiplier = 1_000_000_000_000
            value = value[:-1]

        try:
            return float(value) * multiplier
        except ValueError:
            return None

    def validate_all_claims(self, slide_texts: List[Dict[str, Any]]) -> List["ValidationResult"]:
        """
        Extract and validate all market cap claims from slide texts

        Args:
            slide_texts: List of slide data with 'slide_number' and 'text' keys

        Returns:
            List of ValidationResult objects
        """
        claims = self.extract_market_cap_claims(slide_texts)
        results = []

        print(f"Found {len(claims)} market cap claims to validate...")

        for i, claim in enumerate(claims, 1):
            print(f" Validating claim {i}/{len(claims)}: {claim.company_name} - ${claim.claimed_market_cap}")
            results.append(self.validate_claim_with_rag(claim))

        return results
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
pdf2image
|
||||||
|
openai
|
||||||
|
requests
|
||||||
|
PyMuPDF
|
||||||
|
docling
|
||||||
|
python-dotenv
|
||||||
|
|
@ -0,0 +1,129 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
Clean Market Cap Validation CLI
|
||||||
|
|
||||||
|
Validates market cap claims from pitch deck slides using RAG search.
|
||||||
|
Reports are automatically organized in the processed/ directory.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
from modules.document_validator import (
|
||||||
|
validate_document_claims,
|
||||||
|
validate_all_processed_documents
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Entry point for the market cap validation CLI.

    Parses command-line arguments, resolves the OpenRouter API key, and
    dispatches to either single-file validation (--file) or batch
    validation of every document under processed/ (--all, also the
    default when no mode is given).
    """
    parser = argparse.ArgumentParser(
        description="Validate market cap claims from pitch deck slides using RAG search"
    )
    parser.add_argument(
        '--file', '-f',
        help='Path to JSON file containing slide data'
    )
    parser.add_argument(
        '--document', '-d',
        help='Document name for organized reporting'
    )
    parser.add_argument(
        '--all',
        action='store_true',
        help='Validate all documents in processed/ folder'
    )
    parser.add_argument(
        '--no-save',
        action='store_true',
        help='Do not save validation report to file'
    )
    parser.add_argument(
        '--api-key',
        help='OpenRouter API key (or set OPENROUTER_API_KEY environment variable)'
    )

    args = parser.parse_args()

    # Get API key: explicit flag wins over the environment variable.
    api_key = args.api_key or os.getenv('OPENROUTER_API_KEY')
    if not api_key:
        print("❌ Error: OpenRouter API key required")
        print("   Set OPENROUTER_API_KEY environment variable or use --api-key")
        sys.exit(1)

    try:
        print("🔍 Market Cap Validation with RAG Search")
        print("=========================================")

        if args.all:
            print("📁 Validating all documents in processed/ folder")
            _run_validate_all(api_key)
        elif args.file:
            _run_validate_file(args, api_key)
        else:
            # No mode selected: fall back to validating everything.
            print("📁 Validating all documents in processed/ folder (default)")
            _run_validate_all(api_key)

    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)


def _run_validate_all(api_key):
    """Validate every document in processed/ and print a per-document summary.

    Previously this logic was duplicated verbatim in both the --all branch
    and the no-arguments default branch of main().
    """
    results = validate_all_processed_documents(api_key=api_key)

    print(f"\n✅ Validation Complete!")
    print(f"📊 Processed {len(results)} documents:")

    for doc_name, doc_results in results.items():
        if 'error' in doc_results:
            print(f"  ❌ {doc_name}: {doc_results['error']}")
        else:
            summary = doc_results['summary']
            print(f"  ✅ {doc_name}: {summary['total_claims']} claims, {summary['accuracy_rate']:.1f}% accurate")
            if doc_results['report_filename']:
                print(f"     📄 Report: {doc_results['report_filename']}")


def _run_validate_file(args, api_key):
    """Validate claims from a single JSON slide file and print a summary."""
    document_name = args.document or "Unknown-Document"
    print(f"📁 Validating from file: {args.file}")

    import json
    with open(args.file, 'r', encoding='utf-8') as f:
        slide_data = json.load(f)

    results = validate_document_claims(
        document_name,
        slide_data,
        api_key=api_key,
        save_report=not args.no_save
    )

    # Display results
    summary = results['summary']
    print(f"\n✅ Validation Complete!")
    print(f"📊 Results Summary:")
    print(f"   - Total Claims Found: {summary['total_claims']}")
    print(f"   - Accurate Claims: {summary['accurate_claims']}")
    print(f"   - Inaccurate Claims: {summary['inaccurate_claims']}")
    print(f"   - Accuracy Rate: {summary['accuracy_rate']:.1f}%")

    if results['report_filename']:
        print(f"📄 Detailed report saved to: {results['report_filename']}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the CLI only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
|
||||||
|
|
@ -0,0 +1,233 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
from datetime import datetime
|
||||||
|
import os
|
||||||
|
from .rag_agent import ValidationResult, MarketCapClaim
|
||||||
|
|
||||||
|
|
||||||
|
class ValidationReportGenerator:
|
||||||
|
"""
|
||||||
|
Generates comprehensive validation reports for market cap claims
|
||||||
|
with slide source tracking
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.report_sections = []
|
||||||
|
|
||||||
|
def generate_report(self, validation_results: List[ValidationResult],
|
||||||
|
slide_texts: List[Dict[str, Any]]) -> str:
|
||||||
|
"""
|
||||||
|
Generate a comprehensive validation report
|
||||||
|
|
||||||
|
Args:
|
||||||
|
validation_results: List of ValidationResult objects
|
||||||
|
slide_texts: Original slide text data for context
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Formatted markdown report string
|
||||||
|
"""
|
||||||
|
report = []
|
||||||
|
|
||||||
|
# Header
|
||||||
|
report.append(self._generate_header())
|
||||||
|
|
||||||
|
# Executive Summary
|
||||||
|
report.append(self._generate_executive_summary(validation_results))
|
||||||
|
|
||||||
|
# Detailed Results
|
||||||
|
report.append(self._generate_detailed_results(validation_results))
|
||||||
|
|
||||||
|
# Slide Source Analysis
|
||||||
|
report.append(self._generate_slide_source_analysis(validation_results, slide_texts))
|
||||||
|
|
||||||
|
# RAG Search Details
|
||||||
|
report.append(self._generate_rag_search_details(validation_results))
|
||||||
|
|
||||||
|
# Recommendations
|
||||||
|
report.append(self._generate_recommendations(validation_results))
|
||||||
|
|
||||||
|
return '\n\n'.join(report)
|
||||||
|
|
||||||
|
def _generate_header(self) -> str:
|
||||||
|
"""Generate report header"""
|
||||||
|
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
return f"""# Market Cap Validation Report
|
||||||
|
|
||||||
|
**Generated:** {timestamp}
|
||||||
|
**Report Type:** RAG-Enhanced Validation Analysis
|
||||||
|
**Validation Method:** OpenRouter Web Search Integration
|
||||||
|
|
||||||
|
---
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _generate_executive_summary(self, results: List[ValidationResult]) -> str:
|
||||||
|
"""Generate executive summary section"""
|
||||||
|
total_claims = len(results)
|
||||||
|
accurate_claims = sum(1 for r in results if r.is_accurate)
|
||||||
|
inaccurate_claims = total_claims - accurate_claims
|
||||||
|
high_confidence = sum(1 for r in results if r.confidence_score > 0.7)
|
||||||
|
|
||||||
|
accuracy_rate = (accurate_claims / total_claims * 100) if total_claims > 0 else 0
|
||||||
|
|
||||||
|
return f"""## Executive Summary
|
||||||
|
|
||||||
|
### Key Metrics
|
||||||
|
- **Total Market Cap Claims Analyzed:** {total_claims}
|
||||||
|
- **Claims Validated as Accurate:** {accurate_claims} ({accuracy_rate:.1f}%)
|
||||||
|
- **Claims with Discrepancies:** {inaccurate_claims}
|
||||||
|
- **High Confidence Validations:** {high_confidence}
|
||||||
|
|
||||||
|
### Overall Assessment
|
||||||
|
{'✅ **GOOD** - Most claims appear accurate' if accuracy_rate > 70 else '⚠️ **CAUTION** - Significant discrepancies found' if accuracy_rate < 50 else '🔍 **MIXED** - Some claims require verification'}
|
||||||
|
|
||||||
|
---
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _generate_detailed_results(self, results: List[ValidationResult]) -> str:
|
||||||
|
"""Generate detailed validation results"""
|
||||||
|
if not results:
|
||||||
|
return "## Detailed Results\n\nNo market cap claims found in the analyzed slides.\n\n---"
|
||||||
|
|
||||||
|
report = ["## Detailed Validation Results\n"]
|
||||||
|
|
||||||
|
for i, result in enumerate(results, 1):
|
||||||
|
status_icon = "✅" if result.is_accurate else "❌" if result.discrepancy else "⚠️"
|
||||||
|
confidence_bar = self._generate_confidence_bar(result.confidence_score)
|
||||||
|
|
||||||
|
report.append(f"""### {status_icon} Claim #{i}: {result.claim.company_name}
|
||||||
|
|
||||||
|
**Slide Source:** Slide {result.claim.slide_number}
|
||||||
|
**Claimed Market Cap:** ${result.claim.claimed_market_cap}
|
||||||
|
**Raw Text:** `{result.claim.raw_text}`
|
||||||
|
**Confidence Score:** {confidence_bar} ({result.confidence_score:.2f})
|
||||||
|
|
||||||
|
**Validation Results:**
|
||||||
|
- **Validated Market Cap:** {result.validated_market_cap or 'Not found'}
|
||||||
|
- **Validation Source:** {result.validation_source}
|
||||||
|
- **Accuracy Status:** {'✅ Accurate' if result.is_accurate else '❌ Inaccurate' if result.discrepancy else '⚠️ Uncertain'}
|
||||||
|
""")
|
||||||
|
|
||||||
|
if result.discrepancy:
|
||||||
|
report.append(f"- **Discrepancy:** {result.discrepancy}")
|
||||||
|
|
||||||
|
report.append(f"- **RAG Search Query:** `{result.rag_search_query}`")
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
report.append("---")
|
||||||
|
return '\n'.join(report)
|
||||||
|
|
||||||
|
def _generate_slide_source_analysis(self, results: List[ValidationResult],
|
||||||
|
slide_texts: List[Dict[str, Any]]) -> str:
|
||||||
|
"""Generate slide source analysis section"""
|
||||||
|
report = ["## Slide Source Analysis\n"]
|
||||||
|
|
||||||
|
# Group results by slide
|
||||||
|
slide_claims = {}
|
||||||
|
for result in results:
|
||||||
|
slide_num = result.claim.slide_number
|
||||||
|
if slide_num not in slide_claims:
|
||||||
|
slide_claims[slide_num] = []
|
||||||
|
slide_claims[slide_num].append(result)
|
||||||
|
|
||||||
|
# Find slide texts
|
||||||
|
slide_text_map = {s.get('slide_number', 0): s.get('text', '') for s in slide_texts}
|
||||||
|
|
||||||
|
for slide_num in sorted(slide_claims.keys()):
|
||||||
|
claims = slide_claims[slide_num]
|
||||||
|
slide_text = slide_text_map.get(slide_num, 'No text available')
|
||||||
|
|
||||||
|
report.append(f"""### Slide {slide_num} Analysis
|
||||||
|
|
||||||
|
**Claims Found:** {len(claims)}
|
||||||
|
**Slide Text Preview:** {slide_text[:200]}{'...' if len(slide_text) > 200 else ''}
|
||||||
|
|
||||||
|
**Claims Details:**""")
|
||||||
|
|
||||||
|
for claim in claims:
|
||||||
|
status = "✅ Accurate" if any(r.claim == claim and r.is_accurate for r in results) else "❌ Inaccurate"
|
||||||
|
report.append(f"- {claim.company_name}: ${claim.claimed_market_cap} - {status}")
|
||||||
|
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
report.append("---")
|
||||||
|
return '\n'.join(report)
|
||||||
|
|
||||||
|
def _generate_rag_search_details(self, results: List[ValidationResult]) -> str:
|
||||||
|
"""Generate RAG search details section"""
|
||||||
|
report = ["## RAG Search Details\n"]
|
||||||
|
|
||||||
|
report.append("### Search Methodology")
|
||||||
|
report.append("- **Search Engine:** OpenRouter with Exa integration")
|
||||||
|
report.append("- **Model:** Mistral Small with online search enabled")
|
||||||
|
report.append("- **Search Focus:** Current market cap data (2024-2025)")
|
||||||
|
report.append("- **Validation Threshold:** 80% accuracy tolerance")
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
report.append("### Search Queries Used")
|
||||||
|
unique_queries = list(set(r.rag_search_query for r in results))
|
||||||
|
for i, query in enumerate(unique_queries, 1):
|
||||||
|
report.append(f"{i}. `{query}`")
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
report.append("### Sample RAG Responses")
|
||||||
|
for i, result in enumerate(results[:3], 1): # Show first 3 responses
|
||||||
|
report.append(f"""#### Response #{i}: {result.claim.company_name}
|
||||||
|
```
|
||||||
|
{result.rag_response[:300]}{'...' if len(result.rag_response) > 300 else ''}
|
||||||
|
```""")
|
||||||
|
|
||||||
|
report.append("---")
|
||||||
|
return '\n'.join(report)
|
||||||
|
|
||||||
|
def _generate_recommendations(self, results: List[ValidationResult]) -> str:
|
||||||
|
"""Generate recommendations section"""
|
||||||
|
inaccurate_results = [r for r in results if not r.is_accurate and r.discrepancy]
|
||||||
|
high_confidence_results = [r for r in results if r.confidence_score > 0.7]
|
||||||
|
|
||||||
|
report = ["## Recommendations\n"]
|
||||||
|
|
||||||
|
if inaccurate_results:
|
||||||
|
report.append("### ⚠️ Claims Requiring Attention")
|
||||||
|
for result in inaccurate_results:
|
||||||
|
report.append(f"- **Slide {result.claim.slide_number}:** {result.claim.company_name} - {result.discrepancy}")
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
if high_confidence_results:
|
||||||
|
report.append("### ✅ High Confidence Validations")
|
||||||
|
report.append("The following claims were validated with high confidence:")
|
||||||
|
for result in high_confidence_results:
|
||||||
|
report.append(f"- **Slide {result.claim.slide_number}:** {result.claim.company_name} - ${result.claim.claimed_market_cap}")
|
||||||
|
report.append("")
|
||||||
|
|
||||||
|
report.append("### 📋 General Recommendations")
|
||||||
|
report.append("1. **Verify Discrepancies:** Review claims marked as inaccurate with stakeholders")
|
||||||
|
report.append("2. **Update Sources:** Consider updating slide sources with more recent data")
|
||||||
|
report.append("3. **Regular Validation:** Implement periodic validation of financial claims")
|
||||||
|
report.append("4. **Source Attribution:** Always include data sources and dates in financial slides")
|
||||||
|
|
||||||
|
report.append("\n---")
|
||||||
|
report.append("*Report generated by Market Cap RAG Validation Agent*")
|
||||||
|
|
||||||
|
return '\n'.join(report)
|
||||||
|
|
||||||
|
def _generate_confidence_bar(self, confidence: float) -> str:
|
||||||
|
"""Generate a visual confidence bar"""
|
||||||
|
filled = int(confidence * 10)
|
||||||
|
empty = 10 - filled
|
||||||
|
return f"[{'█' * filled}{'░' * empty}]"
|
||||||
|
|
||||||
|
def save_report(self, report: str, filename: str = None, processed_dir: str = "processed") -> str:
|
||||||
|
"""Save report to file"""
|
||||||
|
if filename is None:
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
filename = f"market_cap_validation_report_{timestamp}.md"
|
||||||
|
|
||||||
|
# Create processed directory if it doesn't exist
|
||||||
|
os.makedirs(processed_dir, exist_ok=True)
|
||||||
|
filepath = os.path.join(processed_dir, filename)
|
||||||
|
|
||||||
|
with open(filepath, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(report)
|
||||||
|
|
||||||
|
return filepath
|
||||||
|
|
@ -0,0 +1,62 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def process_pitch_deck(pdf_path):
    """Extract, analyze, and report on a pitch deck PDF.

    Assembles the markdown report inline (bypassing the broken
    create_slide_markdown helper) and writes it under processed/.

    Args:
        pdf_path: Path to the pitch deck PDF file.

    Returns:
        Path of the generated markdown report file.
    """
    print(f"Processing: {pdf_path}")

    # Imported lazily so merely importing this module has no side effects.
    from client import get_openrouter_client
    from pdf_processor import extract_slides_from_pdf
    from analysis import analyze_slides_batch

    stem = Path(pdf_path).stem

    # Extract slides
    slides = extract_slides_from_pdf(pdf_path, "processed", stem)
    print(f"Extracted {len(slides)} slides")

    # Analyze slides
    client = get_openrouter_client()
    analysis_results = analyze_slides_batch(client, slides)
    print("Analysis complete")

    # Build the report as a list of parts and join once, instead of
    # quadratic string concatenation in a loop.
    parts = [f"# Pitch Deck Analysis: {stem}\n\n"]

    for slide_num, _ in enumerate(slides, 1):
        analysis = analysis_results.get(slide_num, {})

        parts.append(f"## Slide {slide_num}\n\n")
        parts.append("\n\n")  # plain string; was an f-string with no placeholders

        if analysis:
            parts.append(f"**Analysis:**\n{analysis}\n\n")
        else:
            parts.append("**Analysis:** No analysis available\n\n")

        parts.append("---\n\n")

    markdown_content = "".join(parts)

    # Save report
    output_file = f"processed/{stem}_analysis.md"
    os.makedirs("processed", exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(markdown_content)

    print(f"Report saved to: {output_file}")
    return output_file
|
||||||
|
|
||||||
|
# Script entry point: expects exactly one argument, the PDF to process.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python working_app.py <pdf_path>")
        sys.exit(1)

    pdf_path = sys.argv[1]
    # Fail fast on a missing file before any heavyweight processing starts.
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found")
        sys.exit(1)

    process_pitch_deck(pdf_path)
|
||||||
|
After Width: | Height: | Size: 60 KiB |
|
After Width: | Height: | Size: 94 KiB |
|
After Width: | Height: | Size: 86 KiB |
|
After Width: | Height: | Size: 101 KiB |
|
After Width: | Height: | Size: 110 KiB |
|
After Width: | Height: | Size: 32 KiB |
|
After Width: | Height: | Size: 1.8 MiB |
|
After Width: | Height: | Size: 2.3 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 91 KiB |
|
After Width: | Height: | Size: 93 KiB |
|
After Width: | Height: | Size: 32 KiB |
|
After Width: | Height: | Size: 62 KiB |
|
After Width: | Height: | Size: 126 KiB |
|
After Width: | Height: | Size: 327 KiB |
|
After Width: | Height: | Size: 93 KiB |
|
After Width: | Height: | Size: 105 KiB |
|
After Width: | Height: | Size: 100 KiB |
|
|
@ -0,0 +1,6 @@
|
||||||
|
pdf2image
|
||||||
|
openai
|
||||||
|
requests
|
||||||
|
PyMuPDF
|
||||||
|
docling
|
||||||
|
python-dotenv
|
||||||
|
|
@ -0,0 +1,58 @@
|
||||||
|
#!/bin/bash
# start.sh — set up a Python virtualenv and run the pitch deck parser on a file.
#
# Usage: ./start.sh <file_path>

# Handle --help/-h and argument validation FIRST, before any expensive setup.
# (Previously these checks ran only after killing port 3123, creating the
# venv, and installing all dependencies — so `./start.sh --help` performed a
# full, destructive environment setup just to print usage.)
if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
    echo ""
    echo "Pitch Deck Analysis Application"
    echo "=============================="
    echo "Usage: ./start.sh <file_path>"
    echo "Example: ./start.sh presentation.pdf"
    echo ""
    echo "The application will automatically upload the generated report."
    echo ""
    exit 0
fi

# Verify an input file was given and exists
if [ -z "$1" ]; then
    echo "Error: No file specified"
    echo "Usage: ./start.sh <file_path>"
    exit 1
fi

if [ ! -f "$1" ]; then
    echo "Error: File '$1' not found"
    exit 1
fi

# Kill any process running on port 3123
echo "Killing any existing processes on port 3123..."
fuser -k 3123/tcp 2>/dev/null || true

# Create virtual environment if it doesn't exist
if [ ! -d "venv" ]; then
    echo "Creating virtual environment..."
    python3 -m venv venv
fi

# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate

# Verify virtual environment is active
echo "Verifying virtual environment..."
which python3
python3 --version

# Install dependencies
echo "Installing dependencies..."
pip install -r requirements.txt

# Start the application with immediate feedback
echo "Starting pitch deck parser..."
echo "Processing file: $1"
echo "Python path: $(which python3)"
echo "Working directory: $(pwd)"
echo "----------------------------------------"

python3 app.py "$1"
|
||||||