technical-screen-2025-10-22/modules/document_validator.py

#!/usr/bin/env python3

"""
Document-specific validator that organizes reports by document in processed directory
"""

import os
import json
from typing import List, Dict, Any, Optional
from .rag_agent import MarketCapRAGAgent
from .validation_report import ValidationReportGenerator


class DocumentValidator:
    """
    Validates financial claims for specific documents with proper directory organization.

    Claim extraction/validation is delegated to ``MarketCapRAGAgent`` and report
    rendering/saving to ``ValidationReportGenerator``; this class wires the two
    together and decides where each document's report is written.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Create the collaborating agent and report generator.

        Args:
            api_key: Forwarded unchanged to ``MarketCapRAGAgent`` (may be None).
        """
        self.rag_agent = MarketCapRAGAgent(api_key)
        self.report_generator = ValidationReportGenerator()

    def validate_document(self, document_name: str, slide_texts: List[Dict[str, Any]],
                         save_report: bool = True,
                         output_dir: str = "processed") -> Dict[str, Any]:
        """
        Validate financial claims for a specific document

        Args:
            document_name: Name of the document (e.g., "Uber-Pitch-Deck"); also used
                as the sub-directory name and report filename prefix
            slide_texts: List of slide data with 'slide_number' and 'text' keys
            save_report: Whether to save the validation report to file
            output_dir: Root directory under which the per-document directory is
                created. Defaults to "processed" (the previous hard-coded value).

        Returns:
            Dictionary with keys 'document_name', 'validation_results', 'report',
            'report_filename' (None when save_report is False) and 'summary'
        """
        print(f"🔍 Validating financial claims for: {document_name}")

        # Extract and validate claims, then render the report from the results
        validation_results = self.rag_agent.validate_all_claims(slide_texts)
        report = self.report_generator.generate_report(validation_results, slide_texts)

        report_filename = None
        if save_report:
            # Reports live in <output_dir>/<document_name>/ so each document's
            # artifacts stay together.
            doc_dir = os.path.join(output_dir, document_name)
            os.makedirs(doc_dir, exist_ok=True)

            report_filename = self.report_generator.save_report(
                report,
                f"{document_name}_market_cap_validation.md",
                doc_dir
            )
            print(f"📄 Validation report saved to: {report_filename}")

        return {
            'document_name': document_name,
            'validation_results': validation_results,
            'report': report,
            'report_filename': report_filename,
            'summary': self._generate_summary(validation_results)
        }

    def validate_from_processed_folder(self, folder_path: str = "processed") -> Dict[str, Any]:
        """
        Validate all documents in the processed folder

        Every non-hidden sub-directory that contains a ``*_text_content.md`` file
        is treated as one document; the file's whole content is validated as a
        single slide (slide_number 1). Reports are written back into the same
        sub-directory of ``folder_path``.

        Args:
            folder_path: Path to processed folder

        Returns:
            Dictionary keyed by document (directory) name. Each value is either the
            result of ``validate_document`` or ``{'error': <message>}`` when reading
            or validating that document failed.

        Raises:
            ValueError: If ``folder_path`` does not exist or is not a directory.
        """
        if not os.path.isdir(folder_path):
            raise ValueError(f"Processed folder not found: {folder_path}")

        results: Dict[str, Any] = {}

        # os.listdir returns entries in arbitrary order; sort for reproducible runs
        for item in sorted(os.listdir(folder_path)):
            item_path = os.path.join(folder_path, item)
            if item.startswith('.') or not os.path.isdir(item_path):
                continue

            # Sorted so the chosen file is deterministic when several match
            text_files = sorted(
                f for f in os.listdir(item_path) if f.endswith('_text_content.md')
            )
            if not text_files:
                continue

            document_name = item
            text_file = os.path.join(item_path, text_files[0])

            print(f"📁 Processing document: {document_name}")

            # Reading is inside the try so one unreadable file is recorded as that
            # document's error instead of aborting the whole batch.
            try:
                with open(text_file, 'r', encoding='utf-8') as f:
                    content = f.read()

                # The text file is not split per slide, so present it as one slide
                slide_texts = [{
                    "slide_number": 1,
                    "text": content
                }]

                results[document_name] = self.validate_document(
                    document_name, slide_texts, output_dir=folder_path
                )
            except Exception as e:
                print(f"❌ Error processing {document_name}: {e}")
                results[document_name] = {'error': str(e)}

        return results

    def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
        """
        Generate a summary of validation results

        Args:
            validation_results: Result objects exposing ``is_accurate`` and
                ``claim.slide_number`` attributes

        Returns:
            Dictionary with claim counts, 'accuracy_rate' as a percentage
            (0 when there are no claims) and 'claims_by_slide'
        """
        total_claims = len(validation_results)
        accurate_claims = sum(1 for r in validation_results if r.is_accurate)

        return {
            'total_claims': total_claims,
            'accurate_claims': accurate_claims,
            'inaccurate_claims': total_claims - accurate_claims,
            # Guard against division by zero for documents without claims
            'accuracy_rate': (accurate_claims / total_claims * 100) if total_claims > 0 else 0,
            'claims_by_slide': self._group_claims_by_slide(validation_results)
        }

    def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
        """
        Group claims by slide number

        Returns:
            Dictionary mapping ``result.claim.slide_number`` to the results for that
            slide, preserving the input order within each slide
        """
        claims_by_slide: Dict[int, List] = {}
        for result in validation_results:
            claims_by_slide.setdefault(result.claim.slide_number, []).append(result)
        return claims_by_slide


def validate_document_claims(document_name: str, slide_texts: List[Dict[str, Any]],
                           api_key: Optional[str] = None,
                           save_report: bool = True) -> Dict[str, Any]:
    """
    One-shot helper: build a DocumentValidator and validate a single document

    Args:
        document_name: Name of the document
        slide_texts: List of slide data
        api_key: OpenRouter API key (optional), handed to the validator
        save_report: Whether to save the validation report to file

    Returns:
        Whatever ``DocumentValidator.validate_document`` returns for these inputs
    """
    # A fresh validator is created per call; nothing is cached between calls.
    return DocumentValidator(api_key).validate_document(
        document_name,
        slide_texts,
        save_report=save_report,
    )


def validate_all_processed_documents(folder_path: str = "processed",
                                   api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    One-shot helper: build a DocumentValidator and validate a whole folder

    Args:
        folder_path: Path to processed folder
        api_key: OpenRouter API key (optional), handed to the validator

    Returns:
        Whatever ``DocumentValidator.validate_from_processed_folder`` returns
        for ``folder_path``
    """
    # A fresh validator is created per call; nothing is cached between calls.
    batch_validator = DocumentValidator(api_key)
    per_document = batch_validator.validate_from_processed_folder(folder_path)
    return per_document


if __name__ == "__main__":
    # Demo entry point: validate everything under the default "processed" folder
    # and print a one-line outcome per document.
    # NOTE(review): this module uses relative imports, so it presumably has to be
    # launched as a module of its package (python -m ...) — confirm.
    print("Document Validator - RAG Agent")
    print("===============================")

    try:
        all_results = validate_all_processed_documents()

        print("\n✅ Validation Complete!")
        print(f"📊 Processed {len(all_results)} documents:")

        for name in all_results:
            outcome = all_results[name]

            # Failed documents carry only an 'error' entry
            if 'error' in outcome:
                print(f"   ❌ {name}: {outcome['error']}")
                continue

            stats = outcome['summary']
            print(f"   ✅ {name}: {stats['total_claims']} claims, {stats['accuracy_rate']:.1f}% accurate")
            if outcome['report_filename']:
                print(f"      📄 Report: {outcome['report_filename']}")

    except Exception as exc:
        # Top-level boundary: report the failure instead of showing a traceback
        print(f"❌ Error: {exc}")