#!/usr/bin/env python3
"""Markdown report generation for RAG-validated market cap claims."""
from typing import List, Dict, Any, Optional
from datetime import datetime
import os

from .rag_agent import ValidationResult, MarketCapClaim


class ValidationReportGenerator:
    """
    Generates comprehensive validation reports for market cap claims
    with slide source tracking
    """

    def __init__(self):
        # Retained for interface compatibility; not read by the current
        # report-building methods, which assemble sections locally.
        self.report_sections = []

    def generate_report(self, validation_results: List[ValidationResult],
                        slide_texts: List[Dict[str, Any]]) -> str:
        """
        Generate a comprehensive validation report

        Args:
            validation_results: List of ValidationResult objects
            slide_texts: Original slide text data for context

        Returns:
            Formatted markdown report string
        """
        report = []

        # Header
        report.append(self._generate_header())

        # Executive Summary
        report.append(self._generate_executive_summary(validation_results))

        # Detailed Results
        report.append(self._generate_detailed_results(validation_results))

        # Slide Source Analysis
        report.append(self._generate_slide_source_analysis(validation_results, slide_texts))

        # RAG Search Details
        report.append(self._generate_rag_search_details(validation_results))

        # Recommendations
        report.append(self._generate_recommendations(validation_results))

        return '\n\n'.join(report)

    def _generate_header(self) -> str:
        """Generate report header with a generation timestamp."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return f"""# Market Cap Validation Report

**Generated:** {timestamp}
**Report Type:** RAG-Enhanced Validation Analysis
**Validation Method:** OpenRouter Web Search Integration

---
"""

    def _generate_executive_summary(self, results: List[ValidationResult]) -> str:
        """Generate executive summary section with aggregate accuracy metrics."""
        total_claims = len(results)
        accurate_claims = sum(1 for r in results if r.is_accurate)
        inaccurate_claims = total_claims - accurate_claims
        high_confidence = sum(1 for r in results if r.confidence_score > 0.7)

        # Guard against division by zero when no claims were found.
        accuracy_rate = (accurate_claims / total_claims * 100) if total_claims > 0 else 0

        return f"""## Executive Summary

### Key Metrics
- **Total Market Cap Claims Analyzed:** {total_claims}
- **Claims Validated as Accurate:** {accurate_claims} ({accuracy_rate:.1f}%)
- **Claims with Discrepancies:** {inaccurate_claims}
- **High Confidence Validations:** {high_confidence}

### Overall Assessment
{'✅ **GOOD** - Most claims appear accurate' if accuracy_rate > 70 else '⚠️ **CAUTION** - Significant discrepancies found' if accuracy_rate < 50 else '🔍 **MIXED** - Some claims require verification'}

---
"""

    def _generate_detailed_results(self, results: List[ValidationResult]) -> str:
        """Generate a per-claim validation breakdown section."""
        if not results:
            return "## Detailed Results\n\nNo market cap claims found in the analyzed slides.\n\n---"

        report = ["## Detailed Validation Results\n"]

        for i, result in enumerate(results, 1):
            # Accurate -> check; has a discrepancy -> cross; otherwise uncertain.
            status_icon = "✅" if result.is_accurate else "❌" if result.discrepancy else "⚠️"
            confidence_bar = self._generate_confidence_bar(result.confidence_score)

            report.append(f"""### {status_icon} Claim #{i}: {result.claim.company_name}

**Slide Source:** Slide {result.claim.slide_number}
**Claimed Market Cap:** ${result.claim.claimed_market_cap}
**Raw Text:** `{result.claim.raw_text}`
**Confidence Score:** {confidence_bar} ({result.confidence_score:.2f})

**Validation Results:**
- **Validated Market Cap:** {result.validated_market_cap or 'Not found'}
- **Validation Source:** {result.validation_source}
- **Accuracy Status:** {'✅ Accurate' if result.is_accurate else '❌ Inaccurate' if result.discrepancy else '⚠️ Uncertain'}
""")

            if result.discrepancy:
                report.append(f"- **Discrepancy:** {result.discrepancy}")

            report.append(f"- **RAG Search Query:** `{result.rag_search_query}`")
            report.append("")
            report.append("---")

        return '\n'.join(report)

    def _generate_slide_source_analysis(self, results: List[ValidationResult],
                                        slide_texts: List[Dict[str, Any]]) -> str:
        """
        Generate slide source analysis section.

        Groups validation results by slide number and shows a text preview
        for each slide alongside the claims found on it.
        """
        report = ["## Slide Source Analysis\n"]

        # Group results by slide
        slide_claims: Dict[Any, List[ValidationResult]] = {}
        for result in results:
            slide_num = result.claim.slide_number
            if slide_num not in slide_claims:
                slide_claims[slide_num] = []
            slide_claims[slide_num].append(result)

        # Map slide number -> raw slide text for previews.
        slide_text_map = {s.get('slide_number', 0): s.get('text', '') for s in slide_texts}

        for slide_num in sorted(slide_claims.keys()):
            claims = slide_claims[slide_num]
            slide_text = slide_text_map.get(slide_num, 'No text available')

            report.append(f"""### Slide {slide_num} Analysis

**Claims Found:** {len(claims)}
**Slide Text Preview:**
{slide_text[:200]}{'...' if len(slide_text) > 200 else ''}

**Claims Details:**""")

            # BUG FIX: `claims` holds ValidationResult objects (appended above),
            # not MarketCapClaim — the original accessed `claim.company_name`
            # directly (AttributeError) and compared a MarketCapClaim against a
            # ValidationResult in the status check, which could never match and
            # would have labeled every claim inaccurate.
            for res in claims:
                status = "✅ Accurate" if res.is_accurate else "❌ Inaccurate"
                report.append(f"- {res.claim.company_name}: ${res.claim.claimed_market_cap} - {status}")

            report.append("")
            report.append("---")

        return '\n'.join(report)

    def _generate_rag_search_details(self, results: List[ValidationResult]) -> str:
        """Generate RAG search methodology, queries used, and sample responses."""
        report = ["## RAG Search Details\n"]

        report.append("### Search Methodology")
        report.append("- **Search Engine:** OpenRouter with Exa integration")
        report.append("- **Model:** Mistral Small with online search enabled")
        report.append("- **Search Focus:** Current market cap data (2024-2025)")
        report.append("- **Validation Threshold:** 80% accuracy tolerance")
        report.append("")

        report.append("### Search Queries Used")
        # Deduplicate queries; note that set() does not preserve order.
        unique_queries = list(set(r.rag_search_query for r in results))
        for i, query in enumerate(unique_queries, 1):
            report.append(f"{i}. `{query}`")
        report.append("")

        report.append("### Sample RAG Responses")
        for i, result in enumerate(results[:3], 1):  # Show first 3 responses
            report.append(f"""#### Response #{i}: {result.claim.company_name}
```
{result.rag_response[:300]}{'...' if len(result.rag_response) > 300 else ''}
```""")

        report.append("---")
        return '\n'.join(report)

    def _generate_recommendations(self, results: List[ValidationResult]) -> str:
        """Generate recommendations section from inaccurate and high-confidence results."""
        inaccurate_results = [r for r in results if not r.is_accurate and r.discrepancy]
        high_confidence_results = [r for r in results if r.confidence_score > 0.7]

        report = ["## Recommendations\n"]

        if inaccurate_results:
            report.append("### ⚠️ Claims Requiring Attention")
            for result in inaccurate_results:
                report.append(f"- **Slide {result.claim.slide_number}:** {result.claim.company_name} - {result.discrepancy}")
            report.append("")

        if high_confidence_results:
            report.append("### ✅ High Confidence Validations")
            report.append("The following claims were validated with high confidence:")
            for result in high_confidence_results:
                report.append(f"- **Slide {result.claim.slide_number}:** {result.claim.company_name} - ${result.claim.claimed_market_cap}")
            report.append("")

        report.append("### 📋 General Recommendations")
        report.append("1. **Verify Discrepancies:** Review claims marked as inaccurate with stakeholders")
        report.append("2. **Update Sources:** Consider updating slide sources with more recent data")
        report.append("3. **Regular Validation:** Implement periodic validation of financial claims")
        report.append("4. **Source Attribution:** Always include data sources and dates in financial slides")
        report.append("\n---")
        report.append("*Report generated by Market Cap RAG Validation Agent*")

        return '\n'.join(report)

    def _generate_confidence_bar(self, confidence: float) -> str:
        """Generate a visual confidence bar (10 cells, filled in proportion to score)."""
        filled = int(confidence * 10)
        empty = 10 - filled
        return f"[{'█' * filled}{'░' * empty}]"

    def save_report(self, report: str, filename: Optional[str] = None,
                    processed_dir: str = "processed") -> str:
        """
        Save report to file.

        Args:
            report: Rendered markdown report text.
            filename: Output filename; defaults to a timestamped name.
            processed_dir: Directory to write into (created if missing).

        Returns:
            Full path of the written report file.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"market_cap_validation_report_{timestamp}.md"

        # Create processed directory if it doesn't exist
        os.makedirs(processed_dir, exist_ok=True)

        filepath = os.path.join(processed_dir, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(report)

        return filepath