200 lines
7.4 KiB
Python
200 lines
7.4 KiB
Python
#!/usr/bin/env python3
"""
Document-specific validator that organizes reports by document in the processed directory.
"""
|
|
|
|
import os
|
|
import json
|
|
from typing import List, Dict, Any, Optional
|
|
from .rag_agent import MarketCapRAGAgent
|
|
from .validation_report import ValidationReportGenerator
|
|
|
|
|
|
class DocumentValidator:
    """
    Validates financial claims for specific documents with proper directory organization.

    Claim validation is delegated to ``self.rag_agent`` and report rendering /
    saving to ``self.report_generator``; this class wires the two together and
    stores each document's report in its own sub-directory.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Create the validator and its collaborators.

        Args:
            api_key: OpenRouter API key (optional); passed straight through to
                ``MarketCapRAGAgent``.
        """
        self.rag_agent = MarketCapRAGAgent(api_key)
        self.report_generator = ValidationReportGenerator()

    def validate_document(self, document_name: str, slide_texts: List[Dict[str, Any]],
                          save_report: bool = True,
                          output_dir: str = "processed") -> Dict[str, Any]:
        """
        Validate financial claims for a specific document.

        Args:
            document_name: Name of the document (e.g., "Uber-Pitch-Deck")
            slide_texts: List of slide data with 'slide_number' and 'text' keys
            save_report: Whether to save the validation report to file
            output_dir: Root directory under which the per-document report
                directory is created. Defaults to "processed", which was
                previously the hard-coded location.

        Returns:
            Dictionary with the keys 'document_name', 'validation_results',
            'report', 'report_filename' (None when ``save_report`` is False)
            and 'summary'.
        """
        print(f"🔍 Validating financial claims for: {document_name}")

        # Extract and validate claims
        validation_results = self.rag_agent.validate_all_claims(slide_texts)

        # Generate report
        report = self.report_generator.generate_report(validation_results, slide_texts)

        # Save report in proper directory structure
        report_filename = None
        if save_report:
            # Create document-specific directory below the requested root
            doc_dir = os.path.join(output_dir, document_name)
            os.makedirs(doc_dir, exist_ok=True)

            # Save report in document directory
            report_filename = self.report_generator.save_report(
                report,
                f"{document_name}_market_cap_validation.md",
                doc_dir
            )
            print(f"📄 Validation report saved to: {report_filename}")

        # Prepare summary
        summary = self._generate_summary(validation_results)

        return {
            'document_name': document_name,
            'validation_results': validation_results,
            'report': report,
            'report_filename': report_filename,
            'summary': summary
        }

    def validate_from_processed_folder(self, folder_path: str = "processed") -> Dict[str, Any]:
        """
        Validate all documents in the processed folder.

        Every non-hidden sub-directory that contains at least one
        ``*_text_content.md`` file is treated as a document; the directory name
        is used as the document name. Reports are written back into that same
        sub-directory of ``folder_path``.

        Args:
            folder_path: Path to processed folder

        Returns:
            Dictionary with results for each document. A document that failed
            maps to ``{'error': <message>}`` instead of a result dictionary.

        Raises:
            ValueError: If ``folder_path`` does not exist or is not a directory.
        """
        # isdir (rather than exists) so that a plain file is rejected here with
        # the documented ValueError instead of failing later inside os.listdir.
        if not os.path.isdir(folder_path):
            raise ValueError(f"Processed folder not found: {folder_path}")

        results: Dict[str, Any] = {}

        # os.listdir returns entries in arbitrary order; sort so that runs are
        # reproducible.
        for item in sorted(os.listdir(folder_path)):
            item_path = os.path.join(folder_path, item)
            if item.startswith('.') or not os.path.isdir(item_path):
                continue

            # Look for text content files; sorted so the file chosen below is
            # deterministic when a directory holds more than one.
            text_files = sorted(
                f for f in os.listdir(item_path) if f.endswith('_text_content.md')
            )
            if not text_files:
                continue

            document_name = item
            text_file = os.path.join(item_path, text_files[0])

            print(f"📁 Processing document: {document_name}")

            # Best-effort batch: reading and validating both happen inside the
            # try so that one unreadable or failing document is recorded as an
            # error without aborting the remaining documents.
            try:
                with open(text_file, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Convert to slide format: the whole file becomes one slide
                slide_texts = [{
                    "slide_number": 1,
                    "text": content
                }]

                # Write the report next to the source text, i.e. below
                # folder_path rather than the default "processed" root.
                results[document_name] = self.validate_document(
                    document_name, slide_texts, output_dir=folder_path
                )
            except Exception as e:
                print(f"❌ Error processing {document_name}: {e}")
                results[document_name] = {'error': str(e)}

        return results

    def _generate_summary(self, validation_results: List) -> Dict[str, Any]:
        """
        Generate a summary of validation results.

        Args:
            validation_results: Result objects exposing ``is_accurate`` and
                ``claim.slide_number``.

        Returns:
            Dictionary with 'total_claims', 'accurate_claims',
            'inaccurate_claims', 'accuracy_rate' (a percentage, 0 when there
            are no claims) and 'claims_by_slide'.
        """
        total_claims = len(validation_results)
        accurate_claims = sum(1 for r in validation_results if r.is_accurate)
        inaccurate_claims = total_claims - accurate_claims

        return {
            'total_claims': total_claims,
            'accurate_claims': accurate_claims,
            'inaccurate_claims': inaccurate_claims,
            # Guard against division by zero for documents without claims
            'accuracy_rate': (accurate_claims / total_claims * 100) if total_claims > 0 else 0,
            'claims_by_slide': self._group_claims_by_slide(validation_results)
        }

    def _group_claims_by_slide(self, validation_results: List) -> Dict[int, List]:
        """
        Group claims by slide number.

        Returns:
            Plain dict mapping ``result.claim.slide_number`` to the list of
            results for that slide, in their original order.
        """
        claims_by_slide: Dict[int, List] = {}
        for result in validation_results:
            claims_by_slide.setdefault(result.claim.slide_number, []).append(result)
        return claims_by_slide
|
|
|
|
|
|
def validate_document_claims(document_name: str, slide_texts: List[Dict[str, Any]],
                             api_key: Optional[str] = None,
                             save_report: bool = True) -> Dict[str, Any]:
    """
    One-shot helper: build a ``DocumentValidator`` and validate a single document.

    Args:
        document_name: Name of the document
        slide_texts: List of slide data
        api_key: OpenRouter API key (optional)
        save_report: Whether to save the validation report to file

    Returns:
        Whatever ``DocumentValidator.validate_document`` returns for these
        arguments (validation results and report).
    """
    # A fresh validator is created per call; nothing is cached between calls.
    return DocumentValidator(api_key).validate_document(
        document_name,
        slide_texts,
        save_report,
    )
|
|
|
|
|
|
def validate_all_processed_documents(folder_path: str = "processed",
                                     api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    One-shot helper: build a ``DocumentValidator`` and run it over a whole folder.

    Args:
        folder_path: Path to processed folder
        api_key: OpenRouter API key (optional)

    Returns:
        Whatever ``DocumentValidator.validate_from_processed_folder`` returns
        for ``folder_path`` (results keyed by document).
    """
    # A fresh validator is created per call; nothing is cached between calls.
    batch_validator = DocumentValidator(api_key)
    per_document = batch_validator.validate_from_processed_folder(folder_path)
    return per_document
|
|
|
|
|
|
if __name__ == "__main__":
    # Example usage: validate every document below the default "processed"
    # folder and print a one-line summary per document.
    print("Document Validator - RAG Agent")
    print("===============================")

    try:
        results = validate_all_processed_documents()

        print("\n✅ Validation Complete!")
        print(f"📊 Processed {len(results)} documents:")

        for doc_name, doc_results in results.items():
            # Failed documents are reported as {'error': message}
            if 'error' in doc_results:
                print(f" ❌ {doc_name}: {doc_results['error']}")
            else:
                summary = doc_results['summary']
                print(f" ✅ {doc_name}: {summary['total_claims']} claims, {summary['accuracy_rate']:.1f}% accurate")
                if doc_results['report_filename']:
                    print(f" 📄 Report: {doc_results['report_filename']}")

    except Exception as e:
        print(f"❌ Error: {e}")
        # Exit non-zero so shells and CI can detect that the run failed;
        # previously the error was printed but the process still exited 0.
        raise SystemExit(1)
|