287 lines
10 KiB
Python
287 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import re
|
|
import json
|
|
from typing import List, Dict, Any, Optional
|
|
from dataclasses import dataclass
|
|
from .client import get_openrouter_client
|
|
|
|
|
|
@dataclass
|
|
class MarketCapClaim:
|
|
"""Represents a market cap claim found in slide text"""
|
|
slide_number: int
|
|
company_name: str
|
|
claimed_market_cap: str
|
|
raw_text: str
|
|
confidence: float
|
|
|
|
|
|
@dataclass
|
|
class ValidationResult:
|
|
"""Represents the validation result for a market cap claim"""
|
|
claim: MarketCapClaim
|
|
validated_market_cap: Optional[str]
|
|
validation_source: str
|
|
confidence_score: float
|
|
is_accurate: bool
|
|
discrepancy: Optional[str]
|
|
rag_search_query: str
|
|
rag_response: str
|
|
|
|
|
|
class MarketCapRAGAgent:
|
|
"""
|
|
RAG Agent for validating market cap claims from pitch deck slides
|
|
using OpenRouter's web search capabilities
|
|
"""
|
|
|
|
def __init__(self, api_key: Optional[str] = None):
|
|
self.client = get_openrouter_client()
|
|
self.market_cap_patterns = [
|
|
r'market\s+cap(?:italization)?\s*:?\s*\$?([0-9,.]+[BMK]?)',
|
|
r'valuation\s*:?\s*\$?([0-9,.]+[BMK]?)',
|
|
r'worth\s*:?\s*\$?([0-9,.]+[BMK]?)',
|
|
r'valued\s+at\s*:?\s*\$?([0-9,.]+[BMK]?)',
|
|
r'\$([0-9,.]+[BMK]?)\s+(?:market\s+cap|valuation)',
|
|
r'(?:market\s+cap|valuation)\s+of\s+\$?([0-9,.]+[BMK]?)'
|
|
]
|
|
|
|
def extract_market_cap_claims(self, slide_texts: List[Dict[str, Any]]) -> List[MarketCapClaim]:
|
|
"""
|
|
Extract market cap claims from slide text exports
|
|
|
|
Args:
|
|
slide_texts: List of slide data with 'slide_number' and 'text' keys
|
|
|
|
Returns:
|
|
List of MarketCapClaim objects
|
|
"""
|
|
claims = []
|
|
|
|
for slide_data in slide_texts:
|
|
slide_number = slide_data.get('slide_number', 0)
|
|
text = slide_data.get('text', '')
|
|
|
|
if not text:
|
|
continue
|
|
|
|
# Extract company name (usually in first few lines or title)
|
|
company_name = self._extract_company_name(text)
|
|
|
|
# Search for market cap patterns
|
|
for pattern in self.market_cap_patterns:
|
|
matches = re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE)
|
|
|
|
for match in matches:
|
|
claimed_value = match.group(1)
|
|
raw_text = match.group(0)
|
|
|
|
# Calculate confidence based on context
|
|
confidence = self._calculate_confidence(text, match.start(), match.end())
|
|
|
|
claim = MarketCapClaim(
|
|
slide_number=slide_number,
|
|
company_name=company_name,
|
|
claimed_market_cap=claimed_value,
|
|
raw_text=raw_text,
|
|
confidence=confidence
|
|
)
|
|
claims.append(claim)
|
|
|
|
return claims
|
|
|
|
def _extract_company_name(self, text: str) -> str:
|
|
"""Extract company name from slide text"""
|
|
lines = text.split('\n')[:5] # Check first 5 lines
|
|
|
|
for line in lines:
|
|
line = line.strip()
|
|
if line and len(line) > 2 and len(line) < 100:
|
|
# Skip common slide headers
|
|
if not any(header in line.lower() for header in ['slide', 'page', 'agenda', 'overview']):
|
|
return line
|
|
|
|
return "Unknown Company"
|
|
|
|
def _calculate_confidence(self, text: str, start: int, end: int) -> float:
|
|
"""Calculate confidence score for a market cap claim"""
|
|
confidence = 0.5 # Base confidence
|
|
|
|
# Extract context around the match
|
|
context_start = max(0, start - 50)
|
|
context_end = min(len(text), end + 50)
|
|
context = text[context_start:context_end].lower()
|
|
|
|
# Increase confidence for specific indicators
|
|
if any(indicator in context for indicator in ['current', 'latest', 'as of', '2024', '2025']):
|
|
confidence += 0.2
|
|
|
|
if any(indicator in context for indicator in ['billion', 'million', 'trillion']):
|
|
confidence += 0.1
|
|
|
|
if 'market cap' in context or 'valuation' in context:
|
|
confidence += 0.2
|
|
|
|
return min(confidence, 1.0)
|
|
|
|
def validate_claim_with_rag(self, claim: MarketCapClaim) -> ValidationResult:
|
|
"""
|
|
Validate a market cap claim using RAG search
|
|
|
|
Args:
|
|
claim: MarketCapClaim to validate
|
|
|
|
Returns:
|
|
ValidationResult with validation details
|
|
"""
|
|
# Construct RAG search query
|
|
search_query = f"{claim.company_name} current market cap valuation 2024 2025"
|
|
|
|
try:
|
|
# Use OpenRouter with online search enabled
|
|
response = self.client.chat.completions.create(
|
|
model="mistralai/mistral-small",
|
|
messages=[
|
|
{
|
|
"role": "user",
|
|
"content": f"""
|
|
Please search for the current market cap or valuation of {claim.company_name}.
|
|
|
|
The company claims their market cap is ${claim.claimed_market_cap}.
|
|
|
|
Please provide:
|
|
1. The current market cap/valuation if found
|
|
2. The source of this information
|
|
3. Whether the claimed value appears accurate
|
|
4. Any significant discrepancies
|
|
|
|
Focus on recent data from 2024-2025.
|
|
"""
|
|
}
|
|
],
|
|
max_tokens=800
|
|
)
|
|
|
|
rag_response = response.choices[0].message.content.strip()
|
|
|
|
# Parse the response to extract validation details
|
|
validation_details = self._parse_rag_response(rag_response, claim)
|
|
|
|
return ValidationResult(
|
|
claim=claim,
|
|
validated_market_cap=validation_details.get('validated_cap'),
|
|
validation_source=validation_details.get('source', 'RAG Search'),
|
|
confidence_score=validation_details.get('confidence', 0.5),
|
|
is_accurate=validation_details.get('is_accurate', False),
|
|
discrepancy=validation_details.get('discrepancy'),
|
|
rag_search_query=search_query,
|
|
rag_response=rag_response
|
|
)
|
|
|
|
except Exception as e:
|
|
return ValidationResult(
|
|
claim=claim,
|
|
validated_market_cap=None,
|
|
validation_source="Error",
|
|
confidence_score=0.0,
|
|
is_accurate=False,
|
|
discrepancy=f"RAG search failed: {str(e)}",
|
|
rag_search_query=search_query,
|
|
rag_response=f"Error: {str(e)}"
|
|
)
|
|
|
|
def _parse_rag_response(self, response: str, claim: MarketCapClaim) -> Dict[str, Any]:
|
|
"""Parse RAG response to extract validation details"""
|
|
details = {
|
|
'validated_cap': None,
|
|
'source': 'RAG Search',
|
|
'confidence': 0.5,
|
|
'is_accurate': False,
|
|
'discrepancy': None
|
|
}
|
|
|
|
response_lower = response.lower()
|
|
|
|
# Look for market cap values in the response
|
|
cap_patterns = [
|
|
r'\$([0-9,.]+[BMK]?)',
|
|
r'([0-9,.]+[BMK]?)\s+(?:billion|million|trillion)',
|
|
r'market\s+cap(?:italization)?\s*:?\s*\$?([0-9,.]+[BMK]?)'
|
|
]
|
|
|
|
for pattern in cap_patterns:
|
|
matches = re.findall(pattern, response_lower)
|
|
if matches:
|
|
details['validated_cap'] = matches[0]
|
|
break
|
|
|
|
# Determine accuracy
|
|
if details['validated_cap']:
|
|
claimed_normalized = self._normalize_value(claim.claimed_market_cap)
|
|
validated_normalized = self._normalize_value(details['validated_cap'])
|
|
|
|
if claimed_normalized and validated_normalized:
|
|
# Allow for some variance (within 20%)
|
|
ratio = min(claimed_normalized, validated_normalized) / max(claimed_normalized, validated_normalized)
|
|
details['is_accurate'] = ratio > 0.8
|
|
|
|
if not details['is_accurate']:
|
|
details['discrepancy'] = f"Claimed: ${claim.claimed_market_cap}, Found: ${details['validated_cap']}"
|
|
|
|
# Extract source information
|
|
if 'source:' in response_lower or 'according to' in response_lower:
|
|
source_match = re.search(r'(?:source:|according to)\s*([^\n]+)', response_lower)
|
|
if source_match:
|
|
details['source'] = source_match.group(1).strip()
|
|
|
|
return details
|
|
|
|
def _normalize_value(self, value: str) -> Optional[float]:
|
|
"""Normalize market cap value to a comparable number"""
|
|
if not value:
|
|
return None
|
|
|
|
value = value.replace(',', '').upper()
|
|
|
|
multiplier = 1
|
|
if value.endswith('B'):
|
|
multiplier = 1_000_000_000
|
|
value = value[:-1]
|
|
elif value.endswith('M'):
|
|
multiplier = 1_000_000
|
|
value = value[:-1]
|
|
elif value.endswith('K'):
|
|
multiplier = 1_000
|
|
value = value[:-1]
|
|
elif value.endswith('T'):
|
|
multiplier = 1_000_000_000_000
|
|
value = value[:-1]
|
|
|
|
try:
|
|
return float(value) * multiplier
|
|
except ValueError:
|
|
return None
|
|
|
|
def validate_all_claims(self, slide_texts: List[Dict[str, Any]]) -> List[ValidationResult]:
|
|
"""
|
|
Extract and validate all market cap claims from slide texts
|
|
|
|
Args:
|
|
slide_texts: List of slide data with 'slide_number' and 'text' keys
|
|
|
|
Returns:
|
|
List of ValidationResult objects
|
|
"""
|
|
claims = self.extract_market_cap_claims(slide_texts)
|
|
results = []
|
|
|
|
print(f"Found {len(claims)} market cap claims to validate...")
|
|
|
|
for i, claim in enumerate(claims, 1):
|
|
print(f" Validating claim {i}/{len(claims)}: {claim.company_name} - ${claim.claimed_market_cap}")
|
|
result = self.validate_claim_with_rag(claim)
|
|
results.append(result)
|
|
|
|
return results
|