Initial commit: Technical screen project with document analysis capabilities
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
alwaysApply: true
|
||||
---
|
||||
# Code Cleanup Guidelines
|
||||
Remove unused code, imports, and dead functions to keep the codebase clean and maintainable. Regular cleanup prevents technical debt and improves code readability.
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
alwaysApply: true
|
||||
---
|
||||
# Code Length Guidelines
|
||||
Keep all code files under 300 lines for better maintainability and readability. If a file exceeds this limit, consider breaking it into smaller, focused modules.
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
# Environment variables
|
||||
.env
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# Virtual environments
|
||||
venv/
|
||||
env/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
.DS_Store?
|
||||
._*
|
||||
.Spotlight-V100
|
||||
.Trashes
|
||||
ehthumbs.db
|
||||
Thumbs.db
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
|
||||
# Temporary files
|
||||
*.tmp
|
||||
*.temp
|
||||
|
|
@ -0,0 +1,152 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
def generate_toc(markdown_content):
    """Build a markdown Table of Contents from the ## and deeper headers.

    Level-1 headers (# ...) are intentionally ignored; each ## becomes a
    top-level entry and every additional # adds one indent level. Returns
    the TOC as a single string ending with a horizontal rule.
    """
    print(" 📋 Generating Table of Contents...")

    entries = ["## Table of Contents", ""]
    matched = 0

    for raw_line in markdown_content.split('\n'):
        m = re.match(r'^(#{2,})\s+(.+)$', raw_line)
        if not m:
            continue
        matched += 1
        depth = len(m.group(1)) - 2  # ## -> 0, ### -> 1, ...
        heading = m.group(2)

        # GitHub-style anchor: strip punctuation, lowercase, hyphenate spaces
        slug = re.sub(r'[^a-zA-Z0-9\s-]', '', heading.lower())
        slug = re.sub(r'\s+', '-', slug.strip())

        entries.append("  " * depth + f"- [{heading}](#{slug})")

    entries.extend(["", "---", ""])

    print(f" ✅ Generated TOC with {matched} headers")
    return '\n'.join(entries)
|
||||
|
||||
def main():
    """Simple pitch deck analyzer.

    CLI entry point: takes a PDF path as argv[1], extracts slide images,
    runs the multi-agent analysis, assembles a markdown report with a TOC,
    writes it under processed/, and uploads it.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py <pdf_file>")
        return

    pdf_path = sys.argv[1]
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found")
        return

    print(f"🚀 Processing: {pdf_path}")

    # Import what we need directly (avoid __init__.py issues)
    # NOTE(review): the sys.path hack plus function-level imports sidestep
    # modules/__init__.py, which eagerly imports every submodule — confirm
    # this workaround is still intentional.
    print("📦 Importing modules...")
    sys.path.append('modules')
    from client import get_openrouter_client
    from pdf_processor import extract_slides_from_pdf
    from analysis import analyze_slides_batch
    from markdown_utils import send_to_api_and_get_haste_link
    print("✅ Modules imported successfully")

    # Extract slides (one image per page, saved under processed/<stem>/)
    print("📄 Extracting slides...")
    slides = extract_slides_from_pdf(pdf_path, "processed", Path(pdf_path).stem)
    print(f"✅ Extracted {len(slides)} slides")

    # Analyze slides with the multi-agent pipeline
    print("🧠 Analyzing slides...")
    client = get_openrouter_client()
    print("🔗 API client initialized")

    analysis_results = analyze_slides_batch(client, slides)
    print("✅ Analysis complete")

    # Create report
    print("📝 Creating report...")
    markdown_content = f"# Pitch Deck Analysis: {Path(pdf_path).stem}\n\n"

    # Add analysis metadata
    markdown_content += "This analysis was generated using multiple AI agents, each specialized in different aspects of slide evaluation.\n\n"
    markdown_content += f"**Source File:** `{Path(pdf_path).name}` (PDF)\n"
    markdown_content += f"**Analysis Generated:** {len(slides)} slides processed\n"
    markdown_content += "**Processing Method:** Individual processing with specialized AI agents\n"
    markdown_content += "**Text Extraction:** Docling-powered text transcription\n\n"

    print(f"📊 Building markdown for {len(slides)} slides...")
    for i, slide_data in enumerate(slides):
        slide_num = i + 1
        # Analyses are keyed by page number; missing slides fall back to {}
        analysis = analysis_results.get(slide_num, {})

        print(f" 📄 Processing slide {slide_num}...")

        markdown_content += f"# Slide {slide_num}\n\n"
        # NOTE(review): this f-string only appends blank lines — it looks
        # like an image-embed line was lost here; confirm against history.
        markdown_content += f"\n\n"

        if analysis:
            markdown_content += "## Agentic Analysis\n\n"

            # Format each agent's analysis; skip malformed entries
            agent_count = 0
            for agent_key, agent_data in analysis.items():
                if isinstance(agent_data, dict) and 'agent' in agent_data and 'analysis' in agent_data:
                    agent_count += 1
                    agent_name = agent_data['agent']
                    agent_analysis = agent_data['analysis']

                    markdown_content += f"### {agent_name}\n\n"
                    markdown_content += f"{agent_analysis}\n\n"

            print(f" ✅ Added {agent_count} agent analyses")
        else:
            markdown_content += "## Agentic Analysis\n\n"
            markdown_content += "No analysis available\n\n"
            print(f" ⚠️ No analysis available for slide {slide_num}")

        markdown_content += "---\n\n"

    # Generate Table of Contents
    # (generate_toc prints its own near-identical progress line as well)
    print("📋 Generating Table of Contents...")
    toc = generate_toc(markdown_content)

    # Insert TOC after the main title
    print("🔗 Inserting TOC into document...")
    lines = markdown_content.split('\n')
    final_content = []
    final_content.append(lines[0])  # Main title
    final_content.append("")  # Empty line
    final_content.append(toc)  # TOC
    final_content.extend(lines[2:])  # Rest of content (lines[1] is the blank line after the title)

    final_markdown = '\n'.join(final_content)

    # Save report
    output_file = f"processed/{Path(pdf_path).stem}_analysis.md"
    print(f"💾 Saving report to: {output_file}")
    os.makedirs("processed", exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_markdown)

    print(f"✅ Report saved successfully ({len(final_markdown)} characters)")

    # Always upload the report
    print("🌐 Uploading report...")
    haste_url = send_to_api_and_get_haste_link(final_markdown, Path(pdf_path).stem)
    if haste_url:
        print(f"✅ Report uploaded to: {haste_url}")
    else:
        print("❌ Upload failed")


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
# OpenRouter API Configuration
|
||||
OPENROUTER_API_KEY=your_openrouter_api_key_here
|
||||
|
||||
# Optional: Custom OpenAI model (defaults to gpt-3.5-turbo)
|
||||
# OPENROUTER_MODEL=openai/gpt-3.5-turbo
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
#!/usr/bin/env python3

# Pitch Deck Parser Modules
# This package contains all the modular components for the pitch deck analysis application
#
# NOTE(review): importing this package eagerly imports every submodule
# (and their third-party dependencies, e.g. docling) — confirm that heavy
# import-time cost is intended.

from .client import get_openrouter_client
from .file_utils import detect_file_type, convert_to_pdf, convert_with_libreoffice
from .pdf_processor import extract_slides_from_pdf
from .docling_processor import extract_text_with_docling, get_slide_text_content
from .analysis import (
    analyze_slide_with_single_prompt,
    analyze_slides_batch,
    analyze_slide_with_agentic_prompts_parallel,
    process_single_slide_parallel
)
from .markdown_utils import (
    create_slide_markdown,
    create_text_only_markdown,
    send_to_api_and_get_haste_link
)

# Explicit public API of the package
__all__ = [
    'get_openrouter_client',
    'detect_file_type',
    'convert_to_pdf',
    'convert_with_libreoffice',
    'extract_slides_from_pdf',
    'extract_text_with_docling',
    'get_slide_text_content',
    'analyze_slide_with_single_prompt',
    'analyze_slides_batch',
    'analyze_slide_with_agentic_prompts_parallel',
    'process_single_slide_parallel',
    'create_slide_markdown',
    'create_text_only_markdown',
    'send_to_api_and_get_haste_link'
]

# Market Cap RAG Validation
from .rag_agent import MarketCapRAGAgent, MarketCapClaim, ValidationResult
from .validation_report import ValidationReportGenerator
from .market_cap_validator import (
    MarketCapValidator,
    validate_market_caps,
    validate_market_caps_from_file,
    validate_market_caps_from_processed
)

# Update __all__ list
__all__.extend([
    'MarketCapRAGAgent',
    'MarketCapClaim',
    'ValidationResult',
    'ValidationReportGenerator',
    'MarketCapValidator',
    'validate_market_caps',
    'validate_market_caps_from_file',
    'validate_market_caps_from_processed'
])

# Document-specific validation
from .document_validator import (
    DocumentValidator,
    validate_document_claims,
    validate_all_processed_documents
)

# Update __all__ list
__all__.extend([
    'DocumentValidator',
    'validate_document_claims',
    'validate_all_processed_documents'
])

# Main application and CLI tools
# NOTE(review): wildcard imports run these modules at package import time and
# re-export an unknown set of names; explicit imports would be safer.
from .app import *
from .example_usage import *
from .validate_market_caps import *

# Update __all__ list
# NOTE(review): these entries are submodule names, but the wildcard imports
# above bring in the modules' functions, not the submodule objects — confirm
# these names actually resolve for consumers of `from modules import *`.
__all__.extend([
    'app',
    'example_usage',
    'validate_market_caps'
])
|
||||
|
|
@ -0,0 +1,90 @@
|
|||
import re
|
||||
from client import get_openrouter_client
|
||||
|
||||
# Specialized reviewer personas applied to every slide.
# Hoisted to module scope so the dict is built once instead of once per slide.
_AGENTS = {
    'content_extractor': {
        'name': 'Content Extractor',
        'prompt': 'Extract and summarize the key textual content from this slide. Focus on headlines, bullet points, and main messages.'
    },
    'visual_analyzer': {
        'name': 'Visual Analyzer',
        'prompt': 'Analyze the visual design elements of this slide. Comment on layout, colors, typography, and visual hierarchy.'
    },
    'data_interpreter': {
        'name': 'Data Interpreter',
        'prompt': 'Identify and interpret any numerical data, charts, graphs, or metrics present on this slide.'
    },
    'message_evaluator': {
        'name': 'Message Evaluator',
        'prompt': 'Evaluate the effectiveness of the message delivery and communication strategy on this slide.'
    },
    'improvement_suggestor': {
        'name': 'Improvement Suggestor',
        'prompt': 'Suggest specific improvements for this slide in terms of clarity, impact, and effectiveness.'
    }
}


def analyze_slides_batch(client, slides_data, batch_size=1):
    """Run every specialized agent over every slide, one API call per pair.

    Args:
        client: OpenAI-compatible client exposing chat.completions.create.
        slides_data: list of dicts with 'page_num' and 'base64' (PNG) keys.
        batch_size: unused; kept for backward compatibility with callers.

    Returns:
        dict mapping page number -> {agent_key: {'agent': name,
        'analysis': text}}. API failures are recorded as error strings in
        place of the analysis, so one bad call never aborts the run.
    """
    print(f" Processing {len(slides_data)} slides individually...")

    all_results = {}

    for i, slide_data in enumerate(slides_data):
        slide_num = slide_data["page_num"]
        print(f" 🔍 Analyzing slide {slide_num} ({i+1}/{len(slides_data)})...")

        slide_analysis = {}

        # Analyze with each specialized agent
        for j, (agent_key, agent_config) in enumerate(_AGENTS.items()):
            # Progress uses the real agent count (was hard-coded "/5")
            print(f" 🤖 Running {agent_config['name']} ({j+1}/{len(_AGENTS)})...")

            messages = [
                {
                    "role": "system",
                    "content": f"You are a {agent_config['name']} specialized in analyzing pitch deck slides. {agent_config['prompt']}"
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"Analyze slide {slide_num}:"},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{slide_data['base64']}"
                            }
                        }
                    ]
                }
            ]

            try:
                print(f" 📡 Sending API request...")
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages,
                    max_tokens=500
                )

                analysis = response.choices[0].message.content.strip()
                print(f" ✅ {agent_config['name']} completed ({len(analysis)} chars)")

                slide_analysis[agent_key] = {
                    'agent': agent_config['name'],
                    'analysis': analysis
                }

            except Exception as e:
                # Record the failure in place of the analysis so the report
                # still renders for the remaining agents/slides.
                print(f" ❌ {agent_config['name']} failed: {str(e)}")
                slide_analysis[agent_key] = {
                    'agent': agent_config['name'],
                    'analysis': f"Error analyzing slide {slide_num}: {str(e)}"
                }

        all_results[slide_num] = slide_analysis
        print(f" ✅ Slide {slide_num} analysis complete")

    print(f" 🎉 All {len(slides_data)} slides analyzed successfully!")
    return all_results
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import sys
|
||||
from openai import OpenAI
|
||||
from dotenv import load_dotenv
|
||||
|
||||
|
||||
def get_openrouter_client():
    """Build an OpenAI-compatible client pointed at OpenRouter.

    Reads OPENROUTER_API_KEY from the environment (after loading .env via
    python-dotenv) and terminates the process with status 1 when the key is
    absent, empty, or still the template placeholder.
    """
    load_dotenv()  # pull .env values into the process environment

    key = os.getenv('OPENROUTER_API_KEY')
    if key in (None, '', 'your_openrouter_api_key_here'):
        print("❌ Error: OPENROUTER_API_KEY not properly set in .env file")
        print("Please update your .env file with a valid OpenRouter API key")
        sys.exit(1)

    return OpenAI(base_url="https://openrouter.ai/api/v1", api_key=key)
|
||||
|
|
@ -0,0 +1,172 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from docling.document_converter import DocumentConverter
|
||||
from pathlib import Path
|
||||
import fitz # PyMuPDF as fallback
|
||||
import re
|
||||
|
||||
|
||||
def clean_text(text):
    """Strip LaTeX, unusual characters, and run-on whitespace from *text*.

    Returns plain text in which every whitespace run — including newlines —
    is collapsed to a single space. Empty/None input yields "".
    """
    if not text:
        return ""

    # Remove LaTeX commands and inline math
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # Remove \command{content}
    text = re.sub(r'\$[^$]*\$', '', text)  # Remove $math$ expressions
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # Remove remaining \commands

    # Drop characters outside a conservative plaintext whitelist
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' ', text)

    # Collapse every whitespace run (spaces, tabs, newlines) to one space.
    # A trailing re.sub(r'\n\s*\n', '\n\n', ...) pass used to follow, but it
    # was dead code: this substitution already removes all newlines.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
|
||||
|
||||
|
||||
def extract_text_with_docling(pdf_path, output_dir, document_name):
    """Extract text content from PDF using Docling with PyMuPDF fallback.

    Args:
        pdf_path: path to the source PDF.
        output_dir: unused — output always goes under processed/<document_name>/.
        document_name: basename used for the output directory and file name.

    Returns:
        dict with 'text_content' (cleaned text), 'text_file' (Path of the
        saved markdown), and 'processed_dir' (Path), or None when both
        Docling and the PyMuPDF fallback fail.
    """
    print(f"Extracting text content with Docling: {pdf_path}")

    try:
        # Initialize Docling converter
        converter = DocumentConverter()
        # Configure OCR for better text extraction
        # NOTE(review): DocumentConverter may not expose `ocr_options` as a
        # plain attribute in all docling versions — if these lines raise,
        # every call silently takes the PyMuPDF fallback below; confirm.
        converter.ocr_options.engine = "rapidocr"  # Use faster OCR engine
        converter.ocr_options.do_ocr = True
        converter.ocr_options.do_table_ocr = True

        # Convert PDF to text
        result = converter.convert(pdf_path)

        # Get the text content as markdown
        text_content = result.document.export_to_markdown()

        # Clean the text to ensure it's plaintext
        text_content = clean_text(text_content)

        # Create processed directory structure if it doesn't exist
        processed_dir = Path("processed") / document_name
        processed_dir.mkdir(parents=True, exist_ok=True)

        # Save the text content to a file
        text_file = processed_dir / f"{document_name}_text_content.md"
        with open(text_file, 'w', encoding='utf-8') as f:
            f.write(text_content)

        print(f"✅ Text content extracted and saved to: {text_file}")

        return {
            'text_content': text_content,
            'text_file': text_file,
            'processed_dir': processed_dir
        }

    except Exception as e:
        print(f"❌ Docling failed: {e}")
        print("🔄 Trying PyMuPDF fallback...")

        # Fallback to PyMuPDF (mirrors the success path above)
        try:
            text_content = extract_text_with_pymupdf(pdf_path)

            if text_content:
                # Clean the text to ensure it's plaintext
                text_content = clean_text(text_content)

                # Create processed directory structure if it doesn't exist
                processed_dir = Path("processed") / document_name
                processed_dir.mkdir(parents=True, exist_ok=True)

                # Save the text content to a file
                text_file = processed_dir / f"{document_name}_text_content.md"
                with open(text_file, 'w', encoding='utf-8') as f:
                    f.write(text_content)

                print(f"✅ Text content extracted with PyMuPDF fallback: {text_file}")

                return {
                    'text_content': text_content,
                    'text_file': text_file,
                    'processed_dir': processed_dir
                }
            else:
                print("⚠️ PyMuPDF fallback also failed")
                return None

        except Exception as fallback_error:
            print(f"❌ PyMuPDF fallback also failed: {fallback_error}")
            return None
|
||||
|
||||
|
||||
def extract_text_with_pymupdf(pdf_path):
    """Fallback extractor: pull cleaned per-page text via PyMuPDF.

    Pages are joined with "--- Page N ---" separators (later consumed by
    get_slide_text_content). Returns the combined string, or None on any
    failure.
    """
    try:
        pdf = fitz.open(pdf_path)
        chunks = []

        for index in range(len(pdf)):
            # Extract and clean this page's text
            cleaned = clean_text(pdf[index].get_text())

            # Page separator, then the page body
            chunks.append(f"\n--- Page {index + 1} ---\n")
            chunks.append(cleaned)
            chunks.append("\n")

        pdf.close()
        return "".join(chunks)

    except Exception as e:
        print(f"PyMuPDF extraction failed: {e}")
        return None
|
||||
|
||||
|
||||
def get_slide_text_content(text_content, slide_num):
    """Return the cleaned text belonging to slide *slide_num*.

    Looks for the "--- Page N ---" separators written by
    extract_text_with_pymupdf; when no separator matches, falls back to
    paragraph-based and then line-window slicing. Never raises: on error a
    placeholder string is returned.
    """
    try:
        if not text_content:
            return ""

        # Split by page separators
        pages = text_content.split('--- Page')

        # Find the page for this slide.
        # BUGFIX: the old check was page.strip().startswith(f" {slide_num} ---"),
        # which could never match because strip() removes the leading space.
        target_page = None
        for page in pages:
            if page.lstrip().startswith(f"{slide_num} ---"):
                target_page = page
                break

        if target_page:
            # Remove the "N ---" header line and clean up the remainder
            lines = target_page.split('\n')[1:]
            slide_text = '\n'.join(lines).strip()
            return clean_text(slide_text)

        # Fallback: approximate by paragraph sections
        sections = text_content.split('\n\n')
        if slide_num <= len(sections):
            return clean_text(sections[slide_num - 1] if slide_num > 0 else sections[0])

        # Last resort: a window of lines, assuming ~5 lines per slide
        lines = text_content.split('\n')
        start_line = (slide_num - 1) * 5
        end_line = min(start_line + 10, len(lines))  # Up to 10 lines
        return clean_text('\n'.join(lines[start_line:end_line]))

    except Exception as e:
        print(f"⚠️ Error extracting text for slide {slide_num}: {e}")
        return f"[Text content for slide {slide_num} could not be extracted]"
|
||||
|
|
@ -0,0 +1,199 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
Document-specific validator that organizes reports by document in processed directory
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from typing import List, Dict, Any, Optional
|
||||
from .rag_agent import MarketCapRAGAgent
|
||||
from .validation_report import ValidationReportGenerator
|
||||
|
||||
|
||||
class DocumentValidator:
    """
    Validates financial claims for specific documents with proper directory organization
    """

    def __init__(self, api_key: Optional[str] = None):
        # RAG agent extracts and checks claims; report generator renders markdown
        self.rag_agent = MarketCapRAGAgent(api_key)
        self.report_generator = ValidationReportGenerator()

    def validate_document(self, document_name: str, slide_texts: List[Dict[str, Any]],
                         save_report: bool = True) -> Dict[str, Any]:
        """
        Validate financial claims for a specific document

        Args:
            document_name: Name of the document (e.g., "Uber-Pitch-Deck")
            slide_texts: List of slide data with 'slide_number' and 'text' keys
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary containing validation results and report
            (keys: document_name, validation_results, report,
            report_filename — None when not saved — and summary)
        """
        print(f"🔍 Validating financial claims for: {document_name}")

        # Extract and validate claims
        validation_results = self.rag_agent.validate_all_claims(slide_texts)

        # Generate report
        report = self.report_generator.generate_report(validation_results, slide_texts)

        # Save report in proper directory structure
        report_filename = None  # stays None when save_report is False
        if save_report:
            # Create document-specific directory
            doc_dir = os.path.join("processed", document_name)
            os.makedirs(doc_dir, exist_ok=True)

            # Save report in document directory
            report_filename = self.report_generator.save_report(
                report,
                f"{document_name}_market_cap_validation.md",
                doc_dir
            )
            print(f"📄 Validation report saved to: {report_filename}")

        # Prepare summary
        summary = self._generate_summary(validation_results)

        return {
            'document_name': document_name,
            'validation_results': validation_results,
            'report': report,
            'report_filename': report_filename,
            'summary': summary
        }

    def validate_from_processed_folder(self, folder_path: str = "processed") -> Dict[str, Any]:
        """
        Validate all documents in the processed folder

        Args:
            folder_path: Path to processed folder

        Returns:
            Dictionary with results for each document; a failed document maps
            to {'error': message} instead of aborting the batch.

        Raises:
            ValueError: when folder_path does not exist.
        """
        results = {}

        if not os.path.exists(folder_path):
            raise ValueError(f"Processed folder not found: {folder_path}")

        # Find all document directories (skip hidden entries)
        for item in os.listdir(folder_path):
            item_path = os.path.join(folder_path, item)
            if os.path.isdir(item_path) and not item.startswith('.'):
                # Look for text content files produced by the extractors
                text_files = [f for f in os.listdir(item_path) if f.endswith('_text_content.md')]

                if text_files:
                    document_name = item
                    # Only the first matching text file per directory is used
                    text_file = os.path.join(item_path, text_files[0])

                    print(f"📁 Processing document: {document_name}")

                    # Read text content
                    with open(text_file, 'r', encoding='utf-8') as f:
                        content = f.read()

                    # Convert to slide format
                    # NOTE(review): the whole document is treated as a single
                    # slide here — confirm per-slide granularity is not needed.
                    slide_texts = [{
                        "slide_number": 1,
                        "text": content
                    }]

                    # Validate document; record failures per document
                    try:
                        doc_results = self.validate_document(document_name, slide_texts)
                        results[document_name] = doc_results
                    except Exception as e:
                        print(f"❌ Error processing {document_name}: {e}")
                        results[document_name] = {'error': str(e)}

        return results

    def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
        """Generate a summary of validation results (claim counts, accuracy rate, per-slide grouping)."""
        total_claims = len(validation_results)
        accurate_claims = sum(1 for r in validation_results if r.is_accurate)
        inaccurate_claims = total_claims - accurate_claims

        return {
            'total_claims': total_claims,
            'accurate_claims': accurate_claims,
            'inaccurate_claims': inaccurate_claims,
            # Guard against division by zero when no claims were found
            'accuracy_rate': (accurate_claims / total_claims * 100) if total_claims > 0 else 0,
            'claims_by_slide': self._group_claims_by_slide(validation_results)
        }

    def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
        """Group claims by slide number"""
        claims_by_slide = {}
        for result in validation_results:
            slide_num = result.claim.slide_number
            if slide_num not in claims_by_slide:
                claims_by_slide[slide_num] = []
            claims_by_slide[slide_num].append(result)
        return claims_by_slide
|
||||
|
||||
|
||||
def validate_document_claims(document_name: str, slide_texts: List[Dict[str, Any]],
                           api_key: Optional[str] = None,
                           save_report: bool = True) -> Dict[str, Any]:
    """
    Convenience wrapper: build a DocumentValidator and validate one document.

    Args:
        document_name: Name of the document
        slide_texts: List of slide data
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Dictionary containing validation results and report
    """
    return DocumentValidator(api_key).validate_document(
        document_name, slide_texts, save_report)
|
||||
|
||||
|
||||
def validate_all_processed_documents(folder_path: str = "processed",
                                   api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Convenience wrapper: validate every document under *folder_path*.

    Args:
        folder_path: Path to processed folder
        api_key: OpenRouter API key (optional)

    Returns:
        Dictionary with results for each document
    """
    return DocumentValidator(api_key).validate_from_processed_folder(folder_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Example usage / smoke run: validate every document found under processed/
    print("Document Validator - RAG Agent")
    print("===============================")

    try:
        results = validate_all_processed_documents()

        print(f"\n✅ Validation Complete!")
        print(f"📊 Processed {len(results)} documents:")

        # Per-document outcome: either an error message or a summary line
        for doc_name, doc_results in results.items():
            if 'error' in doc_results:
                print(f" ❌ {doc_name}: {doc_results['error']}")
            else:
                summary = doc_results['summary']
                print(f" ✅ {doc_name}: {summary['total_claims']} claims, {summary['accuracy_rate']:.1f}% accurate")
                if doc_results['report_filename']:
                    print(f" 📄 Report: {doc_results['report_filename']}")

    except Exception as e:
        # Top-level guard so the CLI always exits with a readable message
        print(f"❌ Error: {e}")
|
||||
|
|
@ -0,0 +1,111 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def detect_file_type(file_path):
    """Classify *file_path* by its extension (case-insensitive).

    Returns one of: 'pdf', 'powerpoint', 'word', 'openoffice_presentation',
    'openoffice_document', or 'unknown' for anything else.
    """
    extension_map = {
        '.pdf': 'pdf',
        '.pptx': 'powerpoint',
        '.ppt': 'powerpoint',
        '.docx': 'word',
        '.doc': 'word',
        '.odp': 'openoffice_presentation',
        '.odt': 'openoffice_document',
    }
    return extension_map.get(Path(file_path).suffix.lower(), 'unknown')
|
||||
|
||||
|
||||
def convert_to_pdf(input_file, output_dir, document_name):
    """Convert a supported office document to PDF.

    Args:
        input_file: path to the source file.
        output_dir: directory in which to place the temporary PDF.
        document_name: basename used for the temp PDF file.

    Returns:
        str path to a PDF on success, the original path when the input is
        already a PDF, or None on failure/unsupported type/timeout.
    """
    file_type = detect_file_type(input_file)

    if file_type == 'pdf':
        print("✅ File is already PDF, no conversion needed")
        return input_file

    print(f"🔄 Converting {file_type} file to PDF...")

    # BUGFIX: this used to be a plain string built with "+", but .exists()
    # is called on it below — it must be a Path object.
    temp_pdf = Path(output_dir) / f"{document_name}_temp.pdf"

    try:
        if file_type == 'powerpoint':
            # Convert PowerPoint to PDF using pptxtopdf
            # NOTE(review): the paths are interpolated into a `python -c`
            # source string — quote characters in paths will break this;
            # confirm inputs are trusted / sanitized.
            print(" Using pptxtopdf for PowerPoint conversion...")
            result = subprocess.run([
                'python', '-c',
                f'import pptxtopdf; pptxtopdf.convert("{input_file}", "{temp_pdf}")'
            ], capture_output=True, text=True, timeout=60)

            if result.returncode != 0:
                print(f"⚠️ pptxtopdf failed: {result.stderr}")
                # Fallback: try using LibreOffice
                return convert_with_libreoffice(input_file, temp_pdf, file_type)

        elif file_type in ['word', 'openoffice_document', 'openoffice_presentation']:
            # All word-processor and ODF formats go straight through LibreOffice
            return convert_with_libreoffice(input_file, temp_pdf, file_type)

        else:
            print(f"❌ Unsupported file type: {file_type}")
            return None

        if temp_pdf.exists():
            print(f"✅ Successfully converted to PDF: {temp_pdf}")
            return str(temp_pdf)
        else:
            print("❌ Conversion failed - PDF file not created")
            return None

    except subprocess.TimeoutExpired:
        print("❌ Conversion timed out")
        return None
    except Exception as e:
        print(f"❌ Conversion error: {e}")
        return None
|
||||
|
||||
|
||||
def convert_with_libreoffice(input_file, output_pdf, file_type):
    """Convert *input_file* to PDF via headless LibreOffice (soffice).

    Args:
        input_file: path to the source document.
        output_pdf: desired output path (str or Path); the PDF LibreOffice
            produces is renamed to this.
        file_type: label used only for log messages.

    Returns:
        str(output_pdf) on success, None on failure or timeout.
    """
    try:
        print(f" Using LibreOffice for {file_type} conversion...")

        # BUGFIX: callers may pass a plain string; the old code called
        # .parent/.exists()/.rename() on it and referenced an un-imported
        # `os` module — normalize to a Path up front.
        out_path = Path(output_pdf)

        # LibreOffice command
        cmd = [
            'soffice', '--headless', '--convert-to', 'pdf',
            '--outdir', str(out_path.parent),
            str(input_file)
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

        if result.returncode == 0:
            # LibreOffice names the PDF after the input file
            produced = out_path.parent / f"{Path(input_file).stem}.pdf"

            if produced.exists():
                # Rename to our expected temp name
                produced.rename(out_path)
                print(f"✅ LibreOffice conversion successful: {out_path}")
                return str(out_path)

        print(f"⚠️ LibreOffice conversion failed: {result.stderr}")
        return None

    except subprocess.TimeoutExpired:
        print("❌ LibreOffice conversion timed out")
        return None
    except Exception as e:
        print(f"❌ LibreOffice conversion error: {e}")
        return None
|
||||
|
|
@ -0,0 +1,173 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import re
|
||||
import requests
|
||||
import json
|
||||
|
||||
|
||||
def clean_markdown_text(text):
    """Flatten markdown/LaTeX *text* to plain prose.

    Bold/italic/code/header markers are stripped while their inner text is
    kept, LaTeX is removed, and every whitespace run collapses to a single
    space. Empty/None input yields "".
    """
    if not text:
        return ""

    # Remove LaTeX commands and inline math
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # Remove \command{content}
    text = re.sub(r'\$[^$]*\$', '', text)  # Remove $math$ expressions
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # Remove remaining \commands

    # Remove markdown formatting but keep the text
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)  # Remove bold **text**
    text = re.sub(r'\*([^*]+)\*', r'\1', text)  # Remove italic *text*
    text = re.sub(r'`([^`]+)`', r'\1', text)  # Remove code `text`
    text = re.sub(r'#{1,6}\s*', '', text)  # Remove headers # ## ###

    # Drop characters outside a conservative plaintext whitelist
    text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\'\/\&\%\@\#\$\+\=\<\>]', ' ', text)

    # Collapse every whitespace run to one space.
    # A trailing re.sub(r'\n\s*\n', '\n\n', ...) pass used to follow, but it
    # was dead code: no newlines survive this substitution.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
|
||||
|
||||
|
||||
def create_slide_markdown(slide_data, analysis_results, slide_num, slide_text=""):
    """Build the markdown section for one slide.

    Combines the slide image embed, the (cleaned) extracted slide text,
    and every agent's analysis into a single markdown fragment that ends
    with a horizontal rule.

    Args:
        slide_data: Dict describing the slide; must contain 'filename'.
        analysis_results: Mapping of prompt key -> {'agent', 'analysis'}.
        slide_num: 1-based slide number used in headings.
        slide_text: Raw text extracted from the slide (optional).

    Returns:
        Markdown string for this slide.
    """
    parts = [
        f"# Slide {slide_num}\n\n"
        f"![Slide {slide_num}](slides/{slide_data['filename']})\n\n"
    ]

    # Only emit a Text Content section when the slide actually has text.
    if slide_text and slide_text.strip():
        parts.append(f"## Text Content\n\n{clean_markdown_text(slide_text)}\n\n")

    parts.append("## Agentic Analysis\n\n")

    for _key, result in analysis_results.items():
        # Analyses may contain markdown/LaTeX; normalize to plaintext.
        parts.append(f"### {result['agent']}\n\n{clean_markdown_text(result['analysis'])}\n\n")

    parts.append("---\n\n")
    return "".join(parts)
|
||||
|
||||
|
||||
def create_text_only_markdown(markdown_content):
    """Produce an image-free, plaintext version of a markdown document.

    Strips slide-image embeds, "view full size" links and slide-separator
    rules, collapses runs of blank lines, and finally normalizes the
    remainder to plaintext for API submission.

    Args:
        markdown_content: Full markdown document string.

    Returns:
        Cleaned, text-only string.
    """
    # (pattern, replacement, flags) applied in order; order matters because
    # the blank-line collapse must run after the line removals.
    removals = [
        (r'!\[.*?\]\(slides/.*?\)\n', '', 0),                             # image embeds
        (r'\*\[View full size: slides/.*?\]\(slides/.*?\)\*\n', '', 0),   # full-size links
        (r'^---\n', '', re.MULTILINE),                                    # slide separators
        (r'\n{3,}', '\n\n', 0),                                           # excess blank lines
    ]
    stripped = markdown_content
    for pattern, repl, flags in removals:
        stripped = re.sub(pattern, repl, stripped, flags=flags)

    # Final pass guarantees the result is plaintext.
    return clean_markdown_text(stripped).strip()
|
||||
|
||||
|
||||
def send_to_api_and_get_haste_link(markdown_content, document_title):
    """Upload the analysis markdown and return (raw_url, html_url).

    Creates two shareable URLs for the text-only version of the document:
    a raw paste on haste.nixc.us and a rendered HTML page via
    md.colinknapp.com. Each upload is best-effort: a failure in one does
    not prevent the other, and failures are reported on stdout.

    Args:
        markdown_content: Full markdown document (may include images).
        document_title: Title used in the HTML page metadata.

    Returns:
        Tuple (raw_haste_url, html_url); either element may be None when
        the corresponding upload fails.
    """
    try:
        print("Sending to API for URLs...")

        # Images cannot be uploaded as text; strip them first.
        text_only_markdown = create_text_only_markdown(markdown_content)

        # --- Raw markdown paste (haste.nixc.us) --------------------------
        raw_haste_url = None
        try:
            print(" 📝 Creating raw markdown URL...")
            raw_response = requests.post(
                "https://haste.nixc.us/documents",
                data=text_only_markdown.encode('utf-8'),
                headers={"Content-Type": "text/plain"},
                timeout=30
            )

            if raw_response.status_code == 200:
                raw_token = raw_response.text.strip().strip('"')
                # Some haste deployments answer with {"key": "..."} JSON;
                # unwrap the token in that case. json is imported at module
                # top, so the redundant inner import was removed; the bare
                # `except:` was narrowed to the exceptions json can raise.
                if raw_token.startswith('{"key":"') and raw_token.endswith('"}'):
                    try:
                        raw_token = json.loads(raw_token)['key']
                    except (json.JSONDecodeError, KeyError):
                        # Fall back to using the raw body as the token.
                        pass
                raw_haste_url = f"https://haste.nixc.us/{raw_token}"
                print(f" ✅ Raw markdown URL created")
            else:
                print(f" ⚠️ Raw markdown upload failed with status {raw_response.status_code}")
        except Exception as e:
            print(f" ⚠️ Failed to create raw markdown URL: {e}")

        # --- Rendered HTML version (md.colinknapp.com) -------------------
        html_url = None
        try:
            print(" 🎨 Creating HTML version URL...")
            api_data = {
                "markdown": text_only_markdown,
                "format": "html",
                "template": "playful",
                "title": f"Pitch Deck Analysis: {document_title}",
                "subtitle": "AI-Generated Analysis with Agentic Insights",
                "contact": "Generated by Pitch Deck Parser",
                "send_to_haste": True
            }

            response = requests.post(
                "https://md.colinknapp.com/api/convert",
                headers={"Content-Type": "application/json"},
                data=json.dumps(api_data),
                timeout=30
            )

            if response.status_code == 200:
                result = response.json()
                if 'haste_url' in result:
                    # Prefer the md.colinknapp.com viewer URL when the paste
                    # landed on haste.nixc.us; otherwise use the URL as-is.
                    haste_url = result['haste_url']
                    if 'haste.nixc.us/' in haste_url:
                        token = haste_url.split('haste.nixc.us/')[-1]
                        html_url = f"https://md.colinknapp.com/haste/{token}"
                    else:
                        html_url = haste_url
                    print(f" ✅ HTML version URL created")
                else:
                    print(" ⚠️ API response missing haste_url")
            else:
                print(f" ⚠️ HTML API request failed with status {response.status_code}")
        except Exception as e:
            print(f" ⚠️ Failed to create HTML URL: {e}")

        return raw_haste_url, html_url

    except Exception as e:
        print(f"⚠️ Failed to send to API: {e}")
        return None, None
|
||||
|
|
@ -0,0 +1,235 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
Market Cap Validator - Main Interface
|
||||
|
||||
This module provides a simple interface to validate market cap claims
|
||||
from pitch deck slides using RAG search capabilities.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from typing import List, Dict, Any, Optional
|
||||
from .rag_agent import MarketCapRAGAgent
|
||||
from .validation_report import ValidationReportGenerator
|
||||
|
||||
|
||||
class MarketCapValidator:
    """
    Main interface for market cap validation using RAG search.

    Wraps a MarketCapRAGAgent (claim extraction + web validation) and a
    ValidationReportGenerator (markdown reporting).
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the market cap validator

        Args:
            api_key: OpenRouter API key (if not provided, will use environment variable)
        """
        self.rag_agent = MarketCapRAGAgent(api_key)
        self.report_generator = ValidationReportGenerator()

    def validate_from_slides(self, slide_texts: List[Dict[str, Any]],
                             save_report: bool = True) -> Dict[str, Any]:
        """
        Validate market cap claims from slide text exports

        Args:
            slide_texts: List of slide data with 'slide_number' and 'text' keys
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary containing validation results and report
        """
        print("🔍 Starting market cap validation process...")

        # Extract and validate claims via the RAG agent.
        validation_results = self.rag_agent.validate_all_claims(slide_texts)

        # Render the markdown report from the results.
        report = self.report_generator.generate_report(validation_results, slide_texts)

        # Persist the report only when requested.
        report_filename = None
        if save_report:
            report_filename = self.report_generator.save_report(report)
            print(f"📄 Validation report saved to: {report_filename}")

        summary = self._generate_summary(validation_results)

        return {
            'validation_results': validation_results,
            'report': report,
            'report_filename': report_filename,
            'summary': summary
        }

    def validate_from_file(self, file_path: str, save_report: bool = True) -> Dict[str, Any]:
        """
        Validate market cap claims from a JSON file containing slide texts

        Args:
            file_path: Path to JSON file with slide data
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary containing validation results and report

        Raises:
            FileNotFoundError: If *file_path* does not exist.
            ValueError: If the file is not valid JSON.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                slide_texts = json.load(f)

            print(f"📁 Loaded slide data from: {file_path}")
            return self.validate_from_slides(slide_texts, save_report)

        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {file_path}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON file: {e}")

    def validate_from_processed_folder(self, folder_path: str = "processed",
                                       save_report: bool = True) -> Dict[str, Any]:
        """
        Validate market cap claims from processed slide files.

        Scans *folder_path* for .json files and merges their slide data,
        accepting three layouts: a bare list of slides, {"slides": [...]},
        or a single slide dict with a "text" key.

        Args:
            folder_path: Path to folder containing processed slide files
            save_report: Whether to save the validation report to file

        Returns:
            Dictionary containing validation results and report

        Raises:
            ValueError: If no valid slide data is found.
        """
        slide_texts = []

        if os.path.exists(folder_path):
            for filename in os.listdir(folder_path):
                if not filename.endswith('.json'):
                    continue
                file_path = os.path.join(folder_path, filename)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                    # Handle different JSON structures
                    if isinstance(data, list):
                        slide_texts.extend(data)
                    elif isinstance(data, dict) and 'slides' in data:
                        slide_texts.extend(data['slides'])
                    elif isinstance(data, dict) and 'text' in data:
                        slide_texts.append(data)

                except (json.JSONDecodeError, KeyError) as e:
                    # BUG FIX: the skip message previously printed the
                    # literal "(unknown)" instead of the offending filename.
                    print(f"⚠️ Skipping invalid file {filename}: {e}")
                    continue

        if not slide_texts:
            raise ValueError(f"No valid slide data found in {folder_path}")

        print(f"📁 Loaded {len(slide_texts)} slides from processed folder")
        return self.validate_from_slides(slide_texts, save_report)

    def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
        """Generate a summary of validation results."""
        total_claims = len(validation_results)
        accurate_claims = sum(1 for r in validation_results if r.is_accurate)
        inaccurate_claims = total_claims - accurate_claims

        return {
            'total_claims': total_claims,
            'accurate_claims': accurate_claims,
            'inaccurate_claims': inaccurate_claims,
            # Guard against division by zero when no claims were found.
            'accuracy_rate': (accurate_claims / total_claims * 100) if total_claims > 0 else 0,
            'claims_by_slide': self._group_claims_by_slide(validation_results)
        }

    def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
        """Group validation results by their claim's slide number."""
        claims_by_slide = {}
        for result in validation_results:
            slide_num = result.claim.slide_number
            if slide_num not in claims_by_slide:
                claims_by_slide[slide_num] = []
            claims_by_slide[slide_num].append(result)
        return claims_by_slide
|
||||
|
||||
|
||||
def validate_market_caps(slide_texts: List[Dict[str, Any]],
                         api_key: Optional[str] = None,
                         save_report: bool = True) -> Dict[str, Any]:
    """
    Convenience wrapper: validate market cap claims from slide data.

    Args:
        slide_texts: List of slide data with 'slide_number' and 'text' keys
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Dictionary containing validation results and report
    """
    # Delegate straight to a throwaway validator instance.
    return MarketCapValidator(api_key).validate_from_slides(slide_texts, save_report)
|
||||
|
||||
|
||||
def validate_market_caps_from_file(file_path: str,
                                   api_key: Optional[str] = None,
                                   save_report: bool = True) -> Dict[str, Any]:
    """
    Convenience wrapper: validate market cap claims from a JSON file.

    Args:
        file_path: Path to JSON file with slide data
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Dictionary containing validation results and report
    """
    # Delegate straight to a throwaway validator instance.
    return MarketCapValidator(api_key).validate_from_file(file_path, save_report)
|
||||
|
||||
|
||||
def validate_market_caps_from_processed(folder_path: str = "processed",
                                        api_key: Optional[str] = None,
                                        save_report: bool = True) -> Dict[str, Any]:
    """
    Convenience wrapper: validate claims from the processed/ folder.

    Args:
        folder_path: Path to folder containing processed slide files
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Dictionary containing validation results and report
    """
    # Delegate straight to a throwaway validator instance.
    return MarketCapValidator(api_key).validate_from_processed_folder(folder_path, save_report)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Demo entry point: validate whatever slide exports exist under processed/.
    print("Market Cap Validator - RAG Agent")
    print("=================================")

    try:
        outcome = validate_market_caps_from_processed()
        stats = outcome['summary']

        print(f"\n✅ Validation Complete!")
        print(f"📊 Summary:")
        print(f" - Total Claims: {stats['total_claims']}")
        print(f" - Accurate: {stats['accurate_claims']}")
        print(f" - Inaccurate: {stats['inaccurate_claims']}")
        print(f" - Accuracy Rate: {stats['accuracy_rate']:.1f}%")

        if outcome['report_filename']:
            print(f"📄 Report saved to: {outcome['report_filename']}")

    except Exception as e:
        print(f"❌ Error: {e}")
        print("\nUsage examples:")
        print("1. Place slide data JSON files in 'processed/' folder")
        print("2. Run: python -m modules.market_cap_validator")
        print("3. Or use the functions directly in your code")
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import base64
|
||||
import fitz # PyMuPDF for PDF processing
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def extract_slides_from_pdf(pdf_path, output_dir, document_name):
    """Extract each PDF page as a PNG slide image plus a base64 payload.

    Renders every page at 2x zoom, writes it under
    ./processed/<document_name>/slides/, and also keeps the PNG bytes
    base64-encoded for API submission.

    NOTE(review): *output_dir* is currently unused — slides always go under
    ./processed/<document_name>/slides/; confirm intent.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Unused (see note above).
        document_name: Name used for the output folder and file prefixes.

    Returns:
        List of per-slide dicts with keys 'page_num', 'filename', 'path',
        'base64', 'document_name', 'processed_dir'; [] on failure.
    """
    print(f"Extracting slides from PDF: {pdf_path}")

    # Create processed directory structure: ./processed/DocumentName/
    processed_dir = Path("processed") / document_name
    processed_dir.mkdir(parents=True, exist_ok=True)

    # Create slides directory within processed directory
    slides_dir = processed_dir / "slides"
    slides_dir.mkdir(exist_ok=True)

    slides = []

    try:
        # Open PDF with PyMuPDF
        pdf_document = fitz.open(pdf_path)
        try:
            for page_num in range(len(pdf_document)):
                page = pdf_document[page_num]

                # Convert page to image (2x zoom for better quality).
                mat = fitz.Matrix(2.0, 2.0)
                pix = page.get_pixmap(matrix=mat)

                # Save as PNG with document name prefix.
                slide_filename = f"{document_name}_slide_{page_num + 1:03d}.png"
                slide_path = slides_dir / slide_filename
                pix.save(str(slide_path))

                # Convert to base64 for API submission.
                img_data = pix.tobytes("png")
                img_base64 = base64.b64encode(img_data).decode('utf-8')

                slides.append({
                    'page_num': page_num + 1,
                    'filename': slide_filename,
                    'path': slide_path,
                    'base64': img_base64,
                    'document_name': document_name,
                    'processed_dir': processed_dir
                })

                print(f" Extracted slide {page_num + 1}")
        finally:
            # BUG FIX: always release the document handle, even when a page
            # fails mid-loop (the original leaked it on error).
            pdf_document.close()

        print(f"✅ Extracted {len(slides)} slides")
        return slides

    except Exception as e:
        print(f"❌ Error extracting slides: {e}")
        return []
|
||||
|
|
@ -0,0 +1,286 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import re
|
||||
import json
|
||||
from typing import List, Dict, Any, Optional
|
||||
from dataclasses import dataclass
|
||||
from .client import get_openrouter_client
|
||||
|
||||
|
||||
@dataclass
class MarketCapClaim:
    """Represents a market cap claim found in slide text"""
    # 1-based slide the claim was found on (0 when the export had no number).
    slide_number: int
    # Company the claim refers to; "Unknown Company" when not detected.
    company_name: str
    # Claimed figure as matched from text, e.g. "2.5B" (no leading $).
    claimed_market_cap: str
    # Exact text span that matched the market-cap pattern.
    raw_text: str
    # Heuristic extraction confidence in [0, 1].
    confidence: float
|
||||
|
||||
|
||||
@dataclass
class ValidationResult:
    """Represents the validation result for a market cap claim"""
    # The original claim being validated.
    claim: MarketCapClaim
    # Market cap found via RAG search, or None when nothing was found.
    validated_market_cap: Optional[str]
    # Where the validated figure came from ("RAG Search", "Error", or a
    # source string parsed out of the model response).
    validation_source: str
    # Confidence of the validation itself, in [0, 1].
    confidence_score: float
    # True when claimed and validated values agree within ~20%.
    is_accurate: bool
    # Human-readable description of the mismatch, when inaccurate.
    discrepancy: Optional[str]
    # Query string sent to the RAG/web search.
    rag_search_query: str
    # Raw model response that was parsed for validation details.
    rag_response: str
|
||||
|
||||
|
||||
class MarketCapRAGAgent:
    """
    RAG Agent for validating market cap claims from pitch deck slides
    using OpenRouter's web search capabilities
    """

    def __init__(self, api_key: Optional[str] = None):
        # NOTE(review): api_key is accepted but never forwarded —
        # get_openrouter_client() is called with no arguments; confirm
        # whether the key should be passed through.
        self.client = get_openrouter_client()
        # Each pattern captures the numeric figure (group 1), optionally
        # suffixed with B/M/K, e.g. "market cap: $2.5B" or "$10B valuation".
        self.market_cap_patterns = [
            r'market\s+cap(?:italization)?\s*:?\s*\$?([0-9,.]+[BMK]?)',
            r'valuation\s*:?\s*\$?([0-9,.]+[BMK]?)',
            r'worth\s*:?\s*\$?([0-9,.]+[BMK]?)',
            r'valued\s+at\s*:?\s*\$?([0-9,.]+[BMK]?)',
            r'\$([0-9,.]+[BMK]?)\s+(?:market\s+cap|valuation)',
            r'(?:market\s+cap|valuation)\s+of\s+\$?([0-9,.]+[BMK]?)'
        ]

    def extract_market_cap_claims(self, slide_texts: List[Dict[str, Any]]) -> List[MarketCapClaim]:
        """
        Extract market cap claims from slide text exports

        Args:
            slide_texts: List of slide data with 'slide_number' and 'text' keys

        Returns:
            List of MarketCapClaim objects
        """
        claims = []

        for slide_data in slide_texts:
            slide_number = slide_data.get('slide_number', 0)
            text = slide_data.get('text', '')

            if not text:
                continue

            # Extract company name (usually in first few lines or title)
            company_name = self._extract_company_name(text)

            # Search for market cap patterns.
            # NOTE: overlapping patterns can yield duplicate claims for the
            # same text span, one per matching pattern.
            for pattern in self.market_cap_patterns:
                matches = re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)

                for match in matches:
                    claimed_value = match.group(1)
                    raw_text = match.group(0)

                    # Calculate confidence based on context
                    confidence = self._calculate_confidence(text, match.start(), match.end())

                    claim = MarketCapClaim(
                        slide_number=slide_number,
                        company_name=company_name,
                        claimed_market_cap=claimed_value,
                        raw_text=raw_text,
                        confidence=confidence
                    )
                    claims.append(claim)

        return claims

    def _extract_company_name(self, text: str) -> str:
        """Extract company name from slide text.

        Heuristic: first stripped line among the top 5 that is 3-99 chars
        long and does not look like a slide header.
        """
        lines = text.split('\n')[:5]  # Check first 5 lines

        for line in lines:
            line = line.strip()
            if line and len(line) > 2 and len(line) < 100:
                # Skip common slide headers
                if not any(header in line.lower() for header in ['slide', 'page', 'agenda', 'overview']):
                    return line

        return "Unknown Company"

    def _calculate_confidence(self, text: str, start: int, end: int) -> float:
        """Calculate confidence score for a market cap claim.

        Starts at 0.5 and adds bonuses based on a 50-char context window
        around the match; capped at 1.0.
        """
        confidence = 0.5  # Base confidence

        # Extract context around the match
        context_start = max(0, start - 50)
        context_end = min(len(text), end + 50)
        context = text[context_start:context_end].lower()

        # Increase confidence for specific indicators
        if any(indicator in context for indicator in ['current', 'latest', 'as of', '2024', '2025']):
            confidence += 0.2

        if any(indicator in context for indicator in ['billion', 'million', 'trillion']):
            confidence += 0.1

        if 'market cap' in context or 'valuation' in context:
            confidence += 0.2

        return min(confidence, 1.0)

    def validate_claim_with_rag(self, claim: MarketCapClaim) -> ValidationResult:
        """
        Validate a market cap claim using RAG search

        Args:
            claim: MarketCapClaim to validate

        Returns:
            ValidationResult with validation details; on any error a result
            with validation_source="Error" and confidence 0.0 is returned.
        """
        # Construct RAG search query
        search_query = f"{claim.company_name} current market cap valuation 2024 2025"

        try:
            # Use OpenRouter with online search enabled
            response = self.client.chat.completions.create(
                model="mistralai/mistral-small",
                messages=[
                    {
                        "role": "user",
                        "content": f"""
                        Please search for the current market cap or valuation of {claim.company_name}.

                        The company claims their market cap is ${claim.claimed_market_cap}.

                        Please provide:
                        1. The current market cap/valuation if found
                        2. The source of this information
                        3. Whether the claimed value appears accurate
                        4. Any significant discrepancies

                        Focus on recent data from 2024-2025.
                        """
                    }
                ],
                max_tokens=800
            )

            rag_response = response.choices[0].message.content.strip()

            # Parse the response to extract validation details
            validation_details = self._parse_rag_response(rag_response, claim)

            return ValidationResult(
                claim=claim,
                validated_market_cap=validation_details.get('validated_cap'),
                validation_source=validation_details.get('source', 'RAG Search'),
                confidence_score=validation_details.get('confidence', 0.5),
                is_accurate=validation_details.get('is_accurate', False),
                discrepancy=validation_details.get('discrepancy'),
                rag_search_query=search_query,
                rag_response=rag_response
            )

        except Exception as e:
            # Best-effort: fold the failure into the result rather than raise.
            return ValidationResult(
                claim=claim,
                validated_market_cap=None,
                validation_source="Error",
                confidence_score=0.0,
                is_accurate=False,
                discrepancy=f"RAG search failed: {str(e)}",
                rag_search_query=search_query,
                rag_response=f"Error: {str(e)}"
            )

    def _parse_rag_response(self, response: str, claim: MarketCapClaim) -> Dict[str, Any]:
        """Parse RAG response to extract validation details."""
        details = {
            'validated_cap': None,
            'source': 'RAG Search',
            'confidence': 0.5,
            'is_accurate': False,
            'discrepancy': None
        }

        response_lower = response.lower()

        # Look for market cap values in the response.
        # NOTE(review): the second pattern captures only the number, not the
        # "billion"/"million"/"trillion" word, so "2 trillion" normalizes to
        # 2 — a claimed "2T" would then compare unequal; confirm intended.
        cap_patterns = [
            r'\$([0-9,.]+[BMK]?)',
            r'([0-9,.]+[BMK]?)\s+(?:billion|million|trillion)',
            r'market\s+cap(?:italization)?\s*:?\s*\$?([0-9,.]+[BMK]?)'
        ]

        for pattern in cap_patterns:
            matches = re.findall(pattern, response_lower)
            if matches:
                details['validated_cap'] = matches[0]
                break

        # Determine accuracy
        if details['validated_cap']:
            claimed_normalized = self._normalize_value(claim.claimed_market_cap)
            validated_normalized = self._normalize_value(details['validated_cap'])

            if claimed_normalized and validated_normalized:
                # Allow for some variance (within 20%)
                ratio = min(claimed_normalized, validated_normalized) / max(claimed_normalized, validated_normalized)
                details['is_accurate'] = ratio > 0.8

                if not details['is_accurate']:
                    details['discrepancy'] = f"Claimed: ${claim.claimed_market_cap}, Found: ${details['validated_cap']}"

        # Extract source information.
        # NOTE: the search runs on the lowercased response, so the extracted
        # source string is lowercase.
        if 'source:' in response_lower or 'according to' in response_lower:
            source_match = re.search(r'(?:source:|according to)\s*([^\n]+)', response_lower)
            if source_match:
                details['source'] = source_match.group(1).strip()

        return details

    def _normalize_value(self, value: str) -> Optional[float]:
        """Normalize a market cap string (e.g. "2.5B") to a float, or None."""
        if not value:
            return None

        value = value.replace(',', '').upper()

        multiplier = 1
        if value.endswith('B'):
            multiplier = 1_000_000_000
            value = value[:-1]
        elif value.endswith('M'):
            multiplier = 1_000_000
            value = value[:-1]
        elif value.endswith('K'):
            multiplier = 1_000
            value = value[:-1]
        elif value.endswith('T'):
            # 'T' is accepted here although the extraction patterns above
            # only capture B/M/K suffixes.
            multiplier = 1_000_000_000_000
            value = value[:-1]

        try:
            return float(value) * multiplier
        except ValueError:
            return None

    def validate_all_claims(self, slide_texts: List[Dict[str, Any]]) -> List[ValidationResult]:
        """
        Extract and validate all market cap claims from slide texts

        Args:
            slide_texts: List of slide data with 'slide_number' and 'text' keys

        Returns:
            List of ValidationResult objects
        """
        claims = self.extract_market_cap_claims(slide_texts)
        results = []

        print(f"Found {len(claims)} market cap claims to validate...")

        for i, claim in enumerate(claims, 1):
            print(f" Validating claim {i}/{len(claims)}: {claim.company_name} - ${claim.claimed_market_cap}")
            result = self.validate_claim_with_rag(claim)
            results.append(result)

        return results
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
pdf2image
|
||||
openai
|
||||
requests
|
||||
PyMuPDF
|
||||
docling
|
||||
python-dotenv
|
||||
|
|
@ -0,0 +1,129 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
Clean Market Cap Validation CLI
|
||||
|
||||
Validates market cap claims from pitch deck slides using RAG search.
|
||||
Reports are automatically organized in the processed/ directory.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
from modules.document_validator import (
|
||||
validate_document_claims,
|
||||
validate_all_processed_documents
|
||||
)
|
||||
|
||||
|
||||
def _print_batch_results(results):
    """Print the per-document summary lines for a batch validation run."""
    print(f"\n✅ Validation Complete!")
    print(f"📊 Processed {len(results)} documents:")

    for doc_name, doc_results in results.items():
        if 'error' in doc_results:
            print(f" ❌ {doc_name}: {doc_results['error']}")
        else:
            summary = doc_results['summary']
            print(f" ✅ {doc_name}: {summary['total_claims']} claims, {summary['accuracy_rate']:.1f}% accurate")
            if doc_results['report_filename']:
                print(f" 📄 Report: {doc_results['report_filename']}")


def main():
    """CLI entry point: parse arguments and run market cap validation.

    Modes: --all (batch over processed/), --file (single JSON export),
    or no arguments (defaults to batch over processed/). Exits with
    status 1 when the API key is missing or validation raises.
    """
    parser = argparse.ArgumentParser(
        description="Validate market cap claims from pitch deck slides using RAG search"
    )

    parser.add_argument(
        '--file', '-f',
        help='Path to JSON file containing slide data'
    )

    parser.add_argument(
        '--document', '-d',
        help='Document name for organized reporting'
    )

    parser.add_argument(
        '--all',
        action='store_true',
        help='Validate all documents in processed/ folder'
    )

    parser.add_argument(
        '--no-save',
        action='store_true',
        help='Do not save validation report to file'
    )

    parser.add_argument(
        '--api-key',
        help='OpenRouter API key (or set OPENROUTER_API_KEY environment variable)'
    )

    args = parser.parse_args()

    # Resolve the API key from CLI flag or environment.
    api_key = args.api_key or os.getenv('OPENROUTER_API_KEY')
    if not api_key:
        print("❌ Error: OpenRouter API key required")
        print(" Set OPENROUTER_API_KEY environment variable or use --api-key")
        sys.exit(1)

    try:
        print("🔍 Market Cap Validation with RAG Search")
        print("=========================================")

        if args.all:
            print("📁 Validating all documents in processed/ folder")
            # DRY: batch-result printing shared with the default branch below.
            _print_batch_results(validate_all_processed_documents(api_key=api_key))

        elif args.file:
            document_name = args.document or "Unknown-Document"
            print(f"📁 Validating from file: {args.file}")

            import json
            with open(args.file, 'r', encoding='utf-8') as f:
                slide_data = json.load(f)

            results = validate_document_claims(
                document_name,
                slide_data,
                api_key=api_key,
                save_report=not args.no_save
            )

            # Display results
            summary = results['summary']
            print(f"\n✅ Validation Complete!")
            print(f"📊 Results Summary:")
            print(f" - Total Claims Found: {summary['total_claims']}")
            print(f" - Accurate Claims: {summary['accurate_claims']}")
            print(f" - Inaccurate Claims: {summary['inaccurate_claims']}")
            print(f" - Accuracy Rate: {summary['accuracy_rate']:.1f}%")

            if results['report_filename']:
                print(f"📄 Detailed report saved to: {results['report_filename']}")

        else:
            print("📁 Validating all documents in processed/ folder (default)")
            _print_batch_results(validate_all_processed_documents(api_key=api_key))

    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point; keeps the module importable without side effects.
if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,233 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
import os
|
||||
from .rag_agent import ValidationResult, MarketCapClaim
|
||||
|
||||
|
||||
class ValidationReportGenerator:
|
||||
"""
|
||||
Generates comprehensive validation reports for market cap claims
|
||||
with slide source tracking
|
||||
"""
|
||||
|
||||
    def __init__(self):
        # Accumulator for report sections. NOTE(review): the visible
        # generate_report builds its own local list instead of using this
        # attribute — confirm whether it is still needed.
        self.report_sections = []
|
||||
|
||||
def generate_report(self, validation_results: List[ValidationResult],
|
||||
slide_texts: List[Dict[str, Any]]) -> str:
|
||||
"""
|
||||
Generate a comprehensive validation report
|
||||
|
||||
Args:
|
||||
validation_results: List of ValidationResult objects
|
||||
slide_texts: Original slide text data for context
|
||||
|
||||
Returns:
|
||||
Formatted markdown report string
|
||||
"""
|
||||
report = []
|
||||
|
||||
# Header
|
||||
report.append(self._generate_header())
|
||||
|
||||
# Executive Summary
|
||||
report.append(self._generate_executive_summary(validation_results))
|
||||
|
||||
# Detailed Results
|
||||
report.append(self._generate_detailed_results(validation_results))
|
||||
|
||||
# Slide Source Analysis
|
||||
report.append(self._generate_slide_source_analysis(validation_results, slide_texts))
|
||||
|
||||
# RAG Search Details
|
||||
report.append(self._generate_rag_search_details(validation_results))
|
||||
|
||||
# Recommendations
|
||||
report.append(self._generate_recommendations(validation_results))
|
||||
|
||||
return '\n\n'.join(report)
|
||||
|
||||
def _generate_header(self) -> str:
|
||||
"""Generate report header"""
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
return f"""# Market Cap Validation Report
|
||||
|
||||
**Generated:** {timestamp}
|
||||
**Report Type:** RAG-Enhanced Validation Analysis
|
||||
**Validation Method:** OpenRouter Web Search Integration
|
||||
|
||||
---
|
||||
"""
|
||||
|
||||
def _generate_executive_summary(self, results: List[ValidationResult]) -> str:
|
||||
"""Generate executive summary section"""
|
||||
total_claims = len(results)
|
||||
accurate_claims = sum(1 for r in results if r.is_accurate)
|
||||
inaccurate_claims = total_claims - accurate_claims
|
||||
high_confidence = sum(1 for r in results if r.confidence_score > 0.7)
|
||||
|
||||
accuracy_rate = (accurate_claims / total_claims * 100) if total_claims > 0 else 0
|
||||
|
||||
return f"""## Executive Summary
|
||||
|
||||
### Key Metrics
|
||||
- **Total Market Cap Claims Analyzed:** {total_claims}
|
||||
- **Claims Validated as Accurate:** {accurate_claims} ({accuracy_rate:.1f}%)
|
||||
- **Claims with Discrepancies:** {inaccurate_claims}
|
||||
- **High Confidence Validations:** {high_confidence}
|
||||
|
||||
### Overall Assessment
|
||||
{'✅ **GOOD** - Most claims appear accurate' if accuracy_rate > 70 else '⚠️ **CAUTION** - Significant discrepancies found' if accuracy_rate < 50 else '🔍 **MIXED** - Some claims require verification'}
|
||||
|
||||
---
|
||||
"""
|
||||
|
||||
def _generate_detailed_results(self, results: List[ValidationResult]) -> str:
|
||||
"""Generate detailed validation results"""
|
||||
if not results:
|
||||
return "## Detailed Results\n\nNo market cap claims found in the analyzed slides.\n\n---"
|
||||
|
||||
report = ["## Detailed Validation Results\n"]
|
||||
|
||||
for i, result in enumerate(results, 1):
|
||||
status_icon = "✅" if result.is_accurate else "❌" if result.discrepancy else "⚠️"
|
||||
confidence_bar = self._generate_confidence_bar(result.confidence_score)
|
||||
|
||||
report.append(f"""### {status_icon} Claim #{i}: {result.claim.company_name}
|
||||
|
||||
**Slide Source:** Slide {result.claim.slide_number}
|
||||
**Claimed Market Cap:** ${result.claim.claimed_market_cap}
|
||||
**Raw Text:** `{result.claim.raw_text}`
|
||||
**Confidence Score:** {confidence_bar} ({result.confidence_score:.2f})
|
||||
|
||||
**Validation Results:**
|
||||
- **Validated Market Cap:** {result.validated_market_cap or 'Not found'}
|
||||
- **Validation Source:** {result.validation_source}
|
||||
- **Accuracy Status:** {'✅ Accurate' if result.is_accurate else '❌ Inaccurate' if result.discrepancy else '⚠️ Uncertain'}
|
||||
""")
|
||||
|
||||
if result.discrepancy:
|
||||
report.append(f"- **Discrepancy:** {result.discrepancy}")
|
||||
|
||||
report.append(f"- **RAG Search Query:** `{result.rag_search_query}`")
|
||||
report.append("")
|
||||
|
||||
report.append("---")
|
||||
return '\n'.join(report)
|
||||
|
||||
def _generate_slide_source_analysis(self, results: List[ValidationResult],
|
||||
slide_texts: List[Dict[str, Any]]) -> str:
|
||||
"""Generate slide source analysis section"""
|
||||
report = ["## Slide Source Analysis\n"]
|
||||
|
||||
# Group results by slide
|
||||
slide_claims = {}
|
||||
for result in results:
|
||||
slide_num = result.claim.slide_number
|
||||
if slide_num not in slide_claims:
|
||||
slide_claims[slide_num] = []
|
||||
slide_claims[slide_num].append(result)
|
||||
|
||||
# Find slide texts
|
||||
slide_text_map = {s.get('slide_number', 0): s.get('text', '') for s in slide_texts}
|
||||
|
||||
for slide_num in sorted(slide_claims.keys()):
|
||||
claims = slide_claims[slide_num]
|
||||
slide_text = slide_text_map.get(slide_num, 'No text available')
|
||||
|
||||
report.append(f"""### Slide {slide_num} Analysis
|
||||
|
||||
**Claims Found:** {len(claims)}
|
||||
**Slide Text Preview:** {slide_text[:200]}{'...' if len(slide_text) > 200 else ''}
|
||||
|
||||
**Claims Details:**""")
|
||||
|
||||
for claim in claims:
|
||||
status = "✅ Accurate" if any(r.claim == claim and r.is_accurate for r in results) else "❌ Inaccurate"
|
||||
report.append(f"- {claim.company_name}: ${claim.claimed_market_cap} - {status}")
|
||||
|
||||
report.append("")
|
||||
|
||||
report.append("---")
|
||||
return '\n'.join(report)
|
||||
|
||||
def _generate_rag_search_details(self, results: List[ValidationResult]) -> str:
|
||||
"""Generate RAG search details section"""
|
||||
report = ["## RAG Search Details\n"]
|
||||
|
||||
report.append("### Search Methodology")
|
||||
report.append("- **Search Engine:** OpenRouter with Exa integration")
|
||||
report.append("- **Model:** Mistral Small with online search enabled")
|
||||
report.append("- **Search Focus:** Current market cap data (2024-2025)")
|
||||
report.append("- **Validation Threshold:** 80% accuracy tolerance")
|
||||
report.append("")
|
||||
|
||||
report.append("### Search Queries Used")
|
||||
unique_queries = list(set(r.rag_search_query for r in results))
|
||||
for i, query in enumerate(unique_queries, 1):
|
||||
report.append(f"{i}. `{query}`")
|
||||
report.append("")
|
||||
|
||||
report.append("### Sample RAG Responses")
|
||||
for i, result in enumerate(results[:3], 1): # Show first 3 responses
|
||||
report.append(f"""#### Response #{i}: {result.claim.company_name}
|
||||
```
|
||||
{result.rag_response[:300]}{'...' if len(result.rag_response) > 300 else ''}
|
||||
```""")
|
||||
|
||||
report.append("---")
|
||||
return '\n'.join(report)
|
||||
|
||||
def _generate_recommendations(self, results: List[ValidationResult]) -> str:
|
||||
"""Generate recommendations section"""
|
||||
inaccurate_results = [r for r in results if not r.is_accurate and r.discrepancy]
|
||||
high_confidence_results = [r for r in results if r.confidence_score > 0.7]
|
||||
|
||||
report = ["## Recommendations\n"]
|
||||
|
||||
if inaccurate_results:
|
||||
report.append("### ⚠️ Claims Requiring Attention")
|
||||
for result in inaccurate_results:
|
||||
report.append(f"- **Slide {result.claim.slide_number}:** {result.claim.company_name} - {result.discrepancy}")
|
||||
report.append("")
|
||||
|
||||
if high_confidence_results:
|
||||
report.append("### ✅ High Confidence Validations")
|
||||
report.append("The following claims were validated with high confidence:")
|
||||
for result in high_confidence_results:
|
||||
report.append(f"- **Slide {result.claim.slide_number}:** {result.claim.company_name} - ${result.claim.claimed_market_cap}")
|
||||
report.append("")
|
||||
|
||||
report.append("### 📋 General Recommendations")
|
||||
report.append("1. **Verify Discrepancies:** Review claims marked as inaccurate with stakeholders")
|
||||
report.append("2. **Update Sources:** Consider updating slide sources with more recent data")
|
||||
report.append("3. **Regular Validation:** Implement periodic validation of financial claims")
|
||||
report.append("4. **Source Attribution:** Always include data sources and dates in financial slides")
|
||||
|
||||
report.append("\n---")
|
||||
report.append("*Report generated by Market Cap RAG Validation Agent*")
|
||||
|
||||
return '\n'.join(report)
|
||||
|
||||
def _generate_confidence_bar(self, confidence: float) -> str:
|
||||
"""Generate a visual confidence bar"""
|
||||
filled = int(confidence * 10)
|
||||
empty = 10 - filled
|
||||
return f"[{'█' * filled}{'░' * empty}]"
|
||||
|
||||
def save_report(self, report: str, filename: str = None, processed_dir: str = "processed") -> str:
    """Write the report text to disk and return the resulting path.

    Args:
        report: Full markdown report content.
        filename: Target file name; when None a timestamped default is used.
        processed_dir: Directory receiving the file (created on demand).

    Returns:
        The path of the written report file.
    """
    if filename is None:
        # Default to a unique, timestamped report name.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"market_cap_validation_report_{stamp}.md"

    os.makedirs(processed_dir, exist_ok=True)
    destination = os.path.join(processed_dir, filename)

    with open(destination, 'w', encoding='utf-8') as fh:
        fh.write(report)

    return destination
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
def process_pitch_deck(pdf_path):
    """Extract slides from a pitch deck PDF, analyze them, and write a markdown report.

    Args:
        pdf_path: Path to the pitch deck PDF file.

    Returns:
        Path of the generated markdown report under the 'processed' directory.
    """
    print(f"Processing: {pdf_path}")

    # Imported lazily so the module can be loaded without these project deps.
    from client import get_openrouter_client
    from pdf_processor import extract_slides_from_pdf
    from analysis import analyze_slides_batch

    deck_name = Path(pdf_path).stem  # computed once; reused for paths and the title

    # Extract slide content from the PDF.
    slides = extract_slides_from_pdf(pdf_path, "processed", deck_name)
    print(f"Extracted {len(slides)} slides")

    # Run the batch analysis over every slide.
    client = get_openrouter_client()
    analysis_results = analyze_slides_batch(client, slides)
    print("Analysis complete")

    # Assemble the report with a list + join (avoids quadratic '+=' string
    # building; also drops the old no-op `f"\n\n"` append left behind when
    # an image embed was removed).
    parts = [f"# Pitch Deck Analysis: {deck_name}\n\n"]
    for slide_num in range(1, len(slides) + 1):
        analysis = analysis_results.get(slide_num, {})
        parts.append(f"## Slide {slide_num}\n\n")
        if analysis:
            parts.append(f"**Analysis:**\n{analysis}\n\n")
        else:
            parts.append("**Analysis:** No analysis available\n\n")
        parts.append("---\n\n")

    # Save report
    output_file = f"processed/{deck_name}_analysis.md"
    os.makedirs("processed", exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    print(f"Report saved to: {output_file}")
    return output_file
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: validate the argument, then process the deck.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python working_app.py <pdf_path>")
        sys.exit(1)

    pdf_path = cli_args[0]
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found")
        sys.exit(1)

    process_pitch_deck(pdf_path)
|
||||
|
After Width: | Height: | Size: 60 KiB |
|
After Width: | Height: | Size: 94 KiB |
|
After Width: | Height: | Size: 86 KiB |
|
After Width: | Height: | Size: 101 KiB |
|
After Width: | Height: | Size: 110 KiB |
|
After Width: | Height: | Size: 32 KiB |
|
After Width: | Height: | Size: 1.8 MiB |
|
After Width: | Height: | Size: 2.3 MiB |
|
After Width: | Height: | Size: 1.1 MiB |
|
After Width: | Height: | Size: 91 KiB |
|
After Width: | Height: | Size: 93 KiB |
|
After Width: | Height: | Size: 32 KiB |
|
After Width: | Height: | Size: 62 KiB |
|
After Width: | Height: | Size: 126 KiB |
|
After Width: | Height: | Size: 327 KiB |
|
After Width: | Height: | Size: 93 KiB |
|
After Width: | Height: | Size: 105 KiB |
|
After Width: | Height: | Size: 100 KiB |
|
|
@ -0,0 +1,6 @@
|
|||
pdf2image
|
||||
openai
|
||||
requests
|
||||
PyMuPDF
|
||||
docling
|
||||
python-dotenv
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
#!/bin/bash

# Launcher for the pitch deck analysis application.
# Usage: ./start.sh <file_path>   (or --help / -h for usage)

# FIX: show usage immediately — previously the help text was only printed
# after killing the port and running venv creation + pip install.
if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
    echo ""
    echo "Pitch Deck Analysis Application"
    echo "=============================="
    echo "Usage: ./start.sh <file_path>"
    echo "Example: ./start.sh presentation.pdf"
    echo ""
    echo "The application will automatically upload the generated report."
    echo ""
    exit 0
fi

# FIX: validate the input argument before any expensive environment setup,
# so a bad invocation fails fast instead of after a full pip install.
if [ -z "$1" ]; then
    echo "Error: No file specified"
    echo "Usage: ./start.sh <file_path>"
    exit 1
fi

if [ ! -f "$1" ]; then
    echo "Error: File '$1' not found"
    exit 1
fi

# Kill any stale process still holding port 3123 from a previous run.
echo "Killing any existing processes on port 3123..."
fuser -k 3123/tcp 2>/dev/null || true

# Create virtual environment if it doesn't exist
if [ ! -d "venv" ]; then
    echo "Creating virtual environment..."
    python3 -m venv venv
fi

# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate

# Verify virtual environment is active
echo "Verifying virtual environment..."
which python3
python3 --version

# Install dependencies
echo "Installing dependencies..."
pip install -r requirements.txt

# Start the application with immediate feedback
echo "Starting pitch deck parser..."
echo "Processing file: $1"
echo "Python path: $(which python3)"
echo "Working directory: $(pwd)"
echo "----------------------------------------"

python3 app.py "$1"
||||