#!/usr/bin/env python3
"""
Document-specific validator that organizes reports by document in processed directory
"""

import os
import json
from typing import List, Dict, Any, Optional

from .rag_agent import MarketCapRAGAgent
from .validation_report import ValidationReportGenerator


class DocumentValidator:
    """
    Validates financial claims for specific documents with proper directory organization
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Create the RAG agent and report generator used for validation

        Args:
            api_key: API key forwarded unchanged to MarketCapRAGAgent (optional)
        """
        self.rag_agent = MarketCapRAGAgent(api_key)
        self.report_generator = ValidationReportGenerator()

    def validate_document(self, document_name: str, slide_texts: List[Dict[str, Any]],
                          save_report: bool = True,
                          output_dir: str = "processed") -> Dict[str, Any]:
        """
        Validate financial claims for a specific document

        Args:
            document_name: Name of the document (e.g., "Uber-Pitch-Deck")
            slide_texts: List of slide data with 'slide_number' and 'text' keys
            save_report: Whether to save the validation report to file
            output_dir: Base directory under which the document-specific
                report directory is created (defaults to "processed")

        Returns:
            Dictionary with the keys 'document_name', 'validation_results',
            'report', 'report_filename' (None when save_report is False)
            and 'summary'
        """
        print(f"🔍 Validating financial claims for: {document_name}")

        # Extract and validate claims
        validation_results = self.rag_agent.validate_all_claims(slide_texts)

        # Generate report
        report = self.report_generator.generate_report(validation_results, slide_texts)

        # Save report in proper directory structure
        report_filename = None
        if save_report:
            # Create document-specific directory. The base directory is a
            # parameter so reports land next to the documents they describe
            # even when the caller scans a non-default folder.
            doc_dir = os.path.join(output_dir, document_name)
            os.makedirs(doc_dir, exist_ok=True)

            # Save report in document directory
            report_filename = self.report_generator.save_report(
                report,
                f"{document_name}_market_cap_validation.md",
                doc_dir
            )
            print(f"📄 Validation report saved to: {report_filename}")

        # Prepare summary
        summary = self._generate_summary(validation_results)

        return {
            'document_name': document_name,
            'validation_results': validation_results,
            'report': report,
            'report_filename': report_filename,
            'summary': summary
        }

    def validate_from_processed_folder(self, folder_path: str = "processed") -> Dict[str, Any]:
        """
        Validate all documents in the processed folder

        Every non-hidden subdirectory that contains a file ending in
        '_text_content.md' is treated as one document; the whole file is
        passed to validation as a single slide numbered 1. Reports are
        written below folder_path.

        Args:
            folder_path: Path to processed folder

        Returns:
            Dictionary with results for each document, keyed by directory
            name. A document that could not be read or validated maps to
            {'error': <message>} instead of its results.

        Raises:
            ValueError: If folder_path does not exist or is not a directory
        """
        # isdir (not exists) so that a regular file is rejected with the
        # documented ValueError rather than failing later inside os.listdir.
        if not os.path.isdir(folder_path):
            raise ValueError(f"Processed folder not found: {folder_path}")

        results: Dict[str, Any] = {}

        # Find all document directories. os.listdir order is arbitrary, so
        # sort to make processing order and file selection reproducible.
        for item in sorted(os.listdir(folder_path)):
            item_path = os.path.join(folder_path, item)
            if item.startswith('.') or not os.path.isdir(item_path):
                continue

            # Look for text content files
            text_files = sorted(
                f for f in os.listdir(item_path) if f.endswith('_text_content.md')
            )
            if not text_files:
                continue

            document_name = item
            text_file = os.path.join(item_path, text_files[0])

            print(f"📁 Processing document: {document_name}")

            # Reading happens inside the try so one unreadable or badly
            # encoded file is recorded as that document's error instead of
            # aborting the whole batch.
            try:
                # Read text content
                with open(text_file, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Convert to slide format
                slide_texts = [{
                    "slide_number": 1,
                    "text": content
                }]

                # Validate document
                results[document_name] = self.validate_document(
                    document_name, slide_texts, output_dir=folder_path
                )
            except Exception as e:
                print(f"❌ Error processing {document_name}: {e}")
                results[document_name] = {'error': str(e)}

        return results

    def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
        """
        Generate a summary of validation results

        Args:
            validation_results: Result objects exposing an 'is_accurate'
                attribute and a 'claim.slide_number' attribute

        Returns:
            Dictionary with 'total_claims', 'accurate_claims',
            'inaccurate_claims', 'accuracy_rate' (percentage, 0.0 when there
            are no claims) and 'claims_by_slide'
        """
        total_claims = len(validation_results)
        accurate_claims = sum(1 for r in validation_results if r.is_accurate)
        inaccurate_claims = total_claims - accurate_claims

        # Guard the division so an empty result list yields 0.0, not an error.
        accuracy_rate = (accurate_claims / total_claims * 100) if total_claims > 0 else 0.0

        return {
            'total_claims': total_claims,
            'accurate_claims': accurate_claims,
            'inaccurate_claims': inaccurate_claims,
            'accuracy_rate': accuracy_rate,
            'claims_by_slide': self._group_claims_by_slide(validation_results)
        }

    def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
        """
        Group claims by slide number

        Args:
            validation_results: Result objects exposing 'claim.slide_number'

        Returns:
            Dictionary mapping each slide number to the list of results for
            that slide, in their original order
        """
        claims_by_slide: Dict[int, List] = {}
        for result in validation_results:
            claims_by_slide.setdefault(result.claim.slide_number, []).append(result)
        return claims_by_slide
def validate_document_claims(document_name: str, slide_texts: List[Dict[str, Any]],
                             api_key: Optional[str] = None,
                             save_report: bool = True) -> Dict[str, Any]:
    """
    Convenience function to validate claims for a specific document

    Builds a fresh DocumentValidator and delegates to its
    validate_document method.

    Args:
        document_name: Name of the document
        slide_texts: List of slide data
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Dictionary containing validation results and report
    """
    validator = DocumentValidator(api_key)
    return validator.validate_document(document_name, slide_texts, save_report)


def validate_all_processed_documents(folder_path: str = "processed",
                                     api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Convenience function to validate all documents in processed folder

    Builds a fresh DocumentValidator and delegates to its
    validate_from_processed_folder method.

    Args:
        folder_path: Path to processed folder
        api_key: OpenRouter API key (optional)

    Returns:
        Dictionary with results for each document
    """
    validator = DocumentValidator(api_key)
    return validator.validate_from_processed_folder(folder_path)


def _main() -> int:
    """
    Run validation over the default processed folder and print a summary

    Returns:
        Process exit status: 0 on success, 1 if validation raised
    """
    print("Document Validator - RAG Agent")
    print("===============================")

    # Top-level boundary: report any failure to the user and signal it
    # through the exit status instead of letting a traceback escape.
    try:
        results = validate_all_processed_documents()

        print("\n✅ Validation Complete!")
        print(f"📊 Processed {len(results)} documents:")

        for doc_name, doc_results in results.items():
            # Documents that failed carry only an 'error' entry.
            if 'error' in doc_results:
                print(f" ❌ {doc_name}: {doc_results['error']}")
                continue

            summary = doc_results['summary']
            print(f" ✅ {doc_name}: {summary['total_claims']} claims, {summary['accuracy_rate']:.1f}% accurate")
            if doc_results['report_filename']:
                print(f" 📄 Report: {doc_results['report_filename']}")
    except Exception as e:
        print(f"❌ Error: {e}")
        return 1

    return 0


if __name__ == "__main__":
    # Example usage
    raise SystemExit(_main())