technical-screen-2025-10-22/modules/document_validator.py

200 lines
7.4 KiB
Python

#!/usr/bin/env python3
"""
Document-specific validator that saves each validation report in the document's own subdirectory of the processed directory.
"""
import os
import json
from typing import List, Dict, Any, Optional
from .rag_agent import MarketCapRAGAgent
from .validation_report import ValidationReportGenerator
class DocumentValidator:
    """
    Validates financial claims for specific documents with proper directory organization.

    Claim validation is delegated to ``MarketCapRAGAgent`` and report
    generation/saving to ``ValidationReportGenerator``. Reports are written to
    ``<output_dir>/<document_name>/``.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Create the RAG agent and report generator used by this validator.

        Args:
            api_key: Passed through unchanged to ``MarketCapRAGAgent``; may be None.
        """
        self.rag_agent = MarketCapRAGAgent(api_key)
        self.report_generator = ValidationReportGenerator()

    def validate_document(self, document_name: str, slide_texts: List[Dict[str, Any]],
                          save_report: bool = True,
                          output_dir: str = "processed") -> Dict[str, Any]:
        """
        Validate financial claims for a specific document.

        Args:
            document_name: Name of the document (e.g., "Uber-Pitch-Deck")
            slide_texts: List of slide data with 'slide_number' and 'text' keys
            save_report: Whether to save the validation report to file
            output_dir: Root directory under which the per-document report
                directory is created (defaults to "processed")

        Returns:
            Dictionary with the keys 'document_name', 'validation_results',
            'report', 'report_filename' (None when nothing was saved) and
            'summary'.
        """
        print(f"🔍 Validating financial claims for: {document_name}")

        # Extract and validate claims
        validation_results = self.rag_agent.validate_all_claims(slide_texts)

        # Generate report
        report = self.report_generator.generate_report(validation_results, slide_texts)

        # Save report in proper directory structure
        report_filename = None
        if save_report:
            # Create document-specific directory
            doc_dir = os.path.join(output_dir, document_name)
            os.makedirs(doc_dir, exist_ok=True)

            # Save report in document directory
            report_filename = self.report_generator.save_report(
                report,
                f"{document_name}_market_cap_validation.md",
                doc_dir
            )
            print(f"📄 Validation report saved to: {report_filename}")

        return {
            'document_name': document_name,
            'validation_results': validation_results,
            'report': report,
            'report_filename': report_filename,
            'summary': self._generate_summary(validation_results)
        }

    def validate_from_processed_folder(self, folder_path: str = "processed") -> Dict[str, Any]:
        """
        Validate all documents in the processed folder.

        Every non-hidden subdirectory that contains at least one
        ``*_text_content.md`` file is treated as a document; the first such
        file (in sorted order) is read and validated as a single slide.
        Reports are saved under the same ``folder_path``.

        Args:
            folder_path: Path to processed folder

        Returns:
            Dictionary keyed by document name. Each value is either the result
            of ``validate_document`` or ``{'error': <message>}`` when reading
            or validating that document failed.

        Raises:
            ValueError: If ``folder_path`` does not exist.
        """
        results = {}
        if not os.path.exists(folder_path):
            raise ValueError(f"Processed folder not found: {folder_path}")

        # os.listdir returns entries in arbitrary order; sort so that the
        # processing order and the chosen text file are reproducible.
        for item in sorted(os.listdir(folder_path)):
            item_path = os.path.join(folder_path, item)
            if item.startswith('.') or not os.path.isdir(item_path):
                continue

            # Look for text content files
            text_files = sorted(
                f for f in os.listdir(item_path) if f.endswith('_text_content.md')
            )
            if not text_files:
                continue

            document_name = item
            text_file = os.path.join(item_path, text_files[0])
            print(f"📁 Processing document: {document_name}")

            # Reading is inside the try so that one unreadable or badly
            # encoded file is reported for that document only instead of
            # aborting the whole batch.
            try:
                with open(text_file, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Convert to slide format: the whole file is one slide
                slide_texts = [{
                    "slide_number": 1,
                    "text": content
                }]

                results[document_name] = self.validate_document(
                    document_name, slide_texts, output_dir=folder_path
                )
            except Exception as e:
                print(f"❌ Error processing {document_name}: {e}")
                results[document_name] = {'error': str(e)}

        return results

    def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
        """
        Generate a summary of validation results.

        Args:
            validation_results: Results exposing an ``is_accurate`` attribute
                and a ``claim.slide_number`` attribute.

        Returns:
            Dictionary with 'total_claims', 'accurate_claims',
            'inaccurate_claims', 'accuracy_rate' (percentage, 0 when there are
            no claims) and 'claims_by_slide'.
        """
        total_claims = len(validation_results)
        accurate_claims = sum(1 for r in validation_results if r.is_accurate)
        return {
            'total_claims': total_claims,
            'accurate_claims': accurate_claims,
            'inaccurate_claims': total_claims - accurate_claims,
            # Guard against division by zero when no claims were found
            'accuracy_rate': (accurate_claims / total_claims * 100) if total_claims > 0 else 0,
            'claims_by_slide': self._group_claims_by_slide(validation_results)
        }

    def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
        """
        Group claims by slide number.

        Returns:
            Plain dict mapping ``result.claim.slide_number`` to the list of
            results for that slide, in their original order.
        """
        claims_by_slide: Dict[int, List] = {}
        for result in validation_results:
            claims_by_slide.setdefault(result.claim.slide_number, []).append(result)
        return claims_by_slide
def validate_document_claims(document_name: str, slide_texts: List[Dict[str, Any]],
                             api_key: Optional[str] = None,
                             save_report: bool = True) -> Dict[str, Any]:
    """
    One-shot helper: build a DocumentValidator and validate a single document.

    Args:
        document_name: Name of the document
        slide_texts: List of slide data
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Whatever ``DocumentValidator.validate_document`` returns for the
        given document (validation results and report).
    """
    # A fresh validator is created per call; nothing is cached between calls.
    return DocumentValidator(api_key).validate_document(
        document_name,
        slide_texts,
        save_report,
    )
def validate_all_processed_documents(folder_path: str = "processed",
                                     api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    One-shot helper: build a DocumentValidator and validate a whole folder.

    Args:
        folder_path: Path to processed folder
        api_key: OpenRouter API key (optional)

    Returns:
        Whatever ``DocumentValidator.validate_from_processed_folder`` returns:
        a dictionary with results for each document.
    """
    # A fresh validator is created per call; nothing is cached between calls.
    return DocumentValidator(api_key).validate_from_processed_folder(folder_path)
if __name__ == "__main__":
    # Script entry point: validate every document in the default "processed"
    # folder and print a one-line outcome per document.
    print("Document Validator - RAG Agent")
    print("===============================")
    try:
        all_results = validate_all_processed_documents()
        print(f"\n✅ Validation Complete!")
        print(f"📊 Processed {len(all_results)} documents:")
        for name, outcome in all_results.items():
            # Failed documents carry only an 'error' entry; report and move on.
            if 'error' in outcome:
                print(f"{name}: {outcome['error']}")
                continue

            stats = outcome['summary']
            print(f"{name}: {stats['total_claims']} claims, {stats['accuracy_rate']:.1f}% accurate")

            saved_to = outcome['report_filename']
            if saved_to:
                print(f" 📄 Report: {saved_to}")
    except Exception as e:
        # Top-level boundary: show the failure instead of a traceback.
        print(f"❌ Error: {e}")