/**
 * Validate multiple Google Alert queries from markdown files.
 * Uses Playwright with human-like behavior to test queries.
 */

import { readFile, writeFile } from 'fs/promises';
import { pathToFileURL } from 'url';
import { chromium } from 'playwright';
import { validateQuery } from './playwright-scraper.js';

/** Default number of alerts tested per run. */
const DEFAULT_MAX_ALERTS = 5;

/** Default base delay between two tests, in milliseconds. */
const DEFAULT_DELAY_MS = 12000;

/** Upper bound of the random jitter added to every delay, in milliseconds. */
const DELAY_JITTER_MS = 3000;

/** Line prefix that introduces an alert name in the markdown source. */
const ALERT_NAME_PREFIX = '**Alert Name:**';

/** A code fence line (already trimmed), with an optional language tag. */
const CODE_FENCE_PATTERN = /^```[\w+-]*$/;

/** `-"ALERT_NAME:..."` markers embedded in queries; they cause false negatives. */
const ALERT_NAME_MARKER_PATTERN = /-"ALERT_NAME:[^"]*"\s*/g;

/**
 * Extract the alert name from a heading line.
 *
 * @param {string} line - A single markdown line.
 * @returns {string|null} The alert name, or `null` when the line is not a heading.
 */
function parseAlertName(line) {
  if (line.startsWith(ALERT_NAME_PREFIX)) {
    // Prefer a back-ticked name; otherwise take the rest of the line.
    const quoted = line.match(/`([^`]+)`/);
    return quoted ? quoted[1] : line.slice(ALERT_NAME_PREFIX.length).trim();
  }
  if (line.startsWith('## ')) {
    return line.slice(3).trim();
  }
  return null;
}

/**
 * Parse alert definitions out of markdown text.
 *
 * An alert starts at a `**Alert Name:**` line or a `## ` heading; its query is
 * the content of the last fenced code block that follows it. Alerts without a
 * query are dropped.
 *
 * @param {string} content - Raw markdown text.
 * @returns {Array<{name: string, query: string}>} Parsed alerts in file order.
 */
function parseAlertsFromContent(content) {
  const alerts = [];
  let currentAlert = null;
  let queryLines = [];
  let inCodeBlock = false;

  const flushCurrentAlert = () => {
    if (currentAlert && queryLines.length > 0) {
      currentAlert.query = queryLines
        .join('\n')
        .replace(ALERT_NAME_MARKER_PATTERN, '')
        .trim();
      alerts.push(currentAlert);
    }
  };

  // Accept both LF and CRLF line endings.
  for (const line of content.split(/\r?\n/)) {
    if (CODE_FENCE_PATTERN.test(line.trim())) {
      inCodeBlock = !inCodeBlock;
      if (inCodeBlock) {
        // A new fenced block replaces any earlier one for the same alert.
        queryLines = [];
      }
      continue;
    }

    if (inCodeBlock) {
      // Lines inside a fence are never headings; blocks that precede the
      // first alert are skipped.
      if (currentAlert) {
        queryLines.push(line);
      }
      continue;
    }

    const name = parseAlertName(line);
    if (name !== null) {
      flushCurrentAlert();
      currentAlert = { name, query: '' };
      queryLines = [];
    }
  }

  flushCurrentAlert();
  return alerts;
}

/**
 * Parse alert queries from a markdown file.
 *
 * @param {string} filePath - Path of the markdown file to read.
 * @returns {Promise<Array<{name: string, query: string}>>} Parsed alerts.
 * @throws {Error} When the file cannot be read.
 */
async function parseAlertsFromMarkdown(filePath) {
  const content = await readFile(filePath, 'utf-8');
  return parseAlertsFromContent(content);
}

/**
 * Create detailed markdown notes for a single alert test.
 *
 * @param {string} alertName - Display name of the alert.
 * @param {object} result - Validation result (`success`, `query`, counts,
 *   scores, optional `recencyDist`, `results` and `error`).
 * @returns {string} Markdown section terminated by a horizontal rule.
 */
function createAlertNotes(alertName, result) {
  const lines = [
    `## ${alertName}`,
    `**Tested:** ${new Date().toISOString()}`,
    `**Query:** \`${result.query}\``,
    '',
  ];

  if (result.success) {
    // Normalize counts once so the displayed numbers and the analysis agree.
    const resultCount = result.resultCount || 0;
    const recentCount = result.recentCount || 0;
    const relevantCount = result.relevantCount || 0;

    lines.push(
      `**Status:** ✅ Success`,
      `**Total Results:** ${resultCount}`,
      `**Recent Results:** ${recentCount} (today/this week)`,
      `**Relevant Results:** ${relevantCount}`,
      `**Avg Recency Score:** ${result.avgRecencyScore || 0}/10`,
      `**Avg Relevance Score:** ${result.avgRelevanceScore || 0}`,
      '',
    );

    if (result.recencyDist) {
      const dist = result.recencyDist;
      lines.push(
        '**Recency Breakdown:**',
        `- Today: ${dist.today}`,
        `- This Week: ${dist.this_week}`,
        `- This Month: ${dist.this_month}`,
        `- Older: ${dist.older}`,
        `- Unknown: ${dist.unknown}`,
        '',
      );
    }

    // Tuning recommendations
    lines.push('**Analysis:**');
    if (recentCount === 0) {
      lines.push('- ⚠️ No recent results - consider broadening keywords or checking if topic is active');
    } else if (recentCount >= 3) {
      lines.push('- ✅ Good number of recent results');
    }
    if (relevantCount < resultCount / 2) {
      lines.push('- ⚠️ Low relevance - consider adding more specific keywords or filters');
    } else {
      lines.push('- ✅ Good relevance score');
    }
    if (resultCount < 5) {
      lines.push('- ⚠️ Few results - may need to broaden search or check query syntax');
    }
    lines.push('');

    // Sample results
    if (result.results && result.results.length > 0) {
      lines.push('**Sample Results:**');
      result.results.slice(0, 3).forEach((r, idx) => {
        const recencyTag = r.recency && r.recency !== 'unknown' ? `[${r.recency}]` : '';
        const relevanceTag = r.relevant ? '✓' : '○';
        // A result without a snippet must not abort note generation.
        const snippet = String(r.snippet ?? '');
        lines.push(
          `${idx + 1}. ${relevanceTag} ${r.title} ${recencyTag}`,
          ` Domain: ${r.domain}`,
          ` ${snippet.substring(0, 100)}...`,
          '',
        );
      });
    }
  } else {
    lines.push(
      `**Status:** ❌ Failed`,
      `**Error:** ${result.error || 'No results found'}`,
      '',
      '**Recommendations:**',
      '- Check query syntax',
      '- Try broader keywords',
      '- Verify the topic has active discussions',
      '',
    );
  }

  lines.push('---', '');
  return lines.join('\n');
}

/**
 * Return a uniformly shuffled copy of `items` (Fisher-Yates).
 *
 * @param {Array} items - Source array; it is not mutated.
 * @returns {Array} Shuffled copy.
 */
function shuffle(items) {
  const shuffled = [...items];
  for (let i = shuffled.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
  }
  return shuffled;
}

/**
 * Test a batch of queries one after another, pausing between tests and
 * collecting markdown notes for each.
 *
 * @param {object} browser - Playwright browser instance passed to `validateQuery`.
 * @param {Array<{name: string, query: string}>} alerts - Alerts to choose from.
 * @param {object} [options]
 * @param {number} [options.maxAlerts=5] - Maximum number of alerts to test.
 * @param {number} [options.delayBetween=12000] - Base delay between tests (ms);
 *   a random jitter of up to 3 s is added.
 * @param {boolean} [options.randomizeOrder=true] - Shuffle before selecting.
 * @returns {Promise<{results: object[], notes: string}>} One result per tested
 *   alert plus the combined notes document.
 */
async function validateBatch(browser, alerts, options = {}) {
  const {
    maxAlerts = DEFAULT_MAX_ALERTS,
    delayBetween = DEFAULT_DELAY_MS,
    randomizeOrder = true,
  } = options;

  const candidates = randomizeOrder ? shuffle(alerts) : alerts;
  const alertsToTest = candidates.slice(0, maxAlerts);

  const results = [];
  const notes = [
    `# Validation Notes\n`,
    `**Date:** ${new Date().toLocaleString()}`,
    `**Alerts Tested:** ${alertsToTest.length}`,
    `**Delay Between Tests:** ${Math.round(delayBetween / 1000)}s`,
    '',
    '---',
    '',
  ];

  for (const [index, alert] of alertsToTest.entries()) {
    console.log(`\n${'='.repeat(80)}`);
    console.log(`Testing ${index + 1}/${alertsToTest.length}: ${alert.name}`);
    console.log(`${'='.repeat(80)}\n`);

    // Only the query itself is guarded, so every alert yields exactly one
    // result entry.
    let result;
    try {
      result = { name: alert.name, ...(await validateQuery(browser, alert.query)) };
    } catch (error) {
      console.error(`❌ Failed to test "${alert.name}": ${error.message}`);
      result = {
        name: alert.name,
        query: alert.query,
        success: false,
        error: error.message,
      };
    }

    results.push(result);
    notes.push(createAlertNotes(alert.name, result));

    // Pause before the next request even after a failure: failures are often
    // caused by rate limiting, which is exactly when backing off matters.
    if (index < alertsToTest.length - 1) {
      const delay = delayBetween + Math.random() * DELAY_JITTER_MS;
      console.log(`\n⏱️ Waiting ${Math.round(delay / 1000)}s before next test (polite scraping)...\n`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }

  return { results, notes: notes.join('\n') };
}

/**
 * Mean of a numeric field over `items`, treating missing values as 0.
 *
 * @param {object[]} items - Records to average over.
 * @param {string} key - Name of the numeric field.
 * @returns {number} The mean, or 0 for an empty list.
 */
function averageOf(items, key) {
  if (items.length === 0) {
    return 0;
  }
  return items.reduce((sum, item) => sum + (item[key] || 0), 0) / items.length;
}

/**
 * Print a validation report with recency and relevance metrics and return the
 * aggregated figures.
 *
 * @param {object[]} results - Results produced by `validateBatch`.
 * @returns {{total: number, successful: number, failed: number,
 *   successRate: number, totalRecent: number, totalRelevant: number,
 *   avgRecencyScore: number, avgRelevanceScore: number, results: object[]}}
 *   Aggregates; `successRate` is a percentage and is 0 for an empty input.
 */
function generateReport(results) {
  const successful = results.filter((r) => r.success);
  const failed = results.filter((r) => !r.success);

  // Aggregate metrics
  const totalRecent = successful.reduce((sum, r) => sum + (r.recentCount || 0), 0);
  const totalRelevant = successful.reduce((sum, r) => sum + (r.relevantCount || 0), 0);
  const avgRecencyScore = averageOf(successful, 'avgRecencyScore');
  const avgRelevanceScore = averageOf(successful, 'avgRelevanceScore');
  // Guard the empty case so the report never contains NaN.
  const successRate = results.length > 0 ? (successful.length / results.length) * 100 : 0;

  console.log(`\n${'='.repeat(80)}`);
  console.log(`VALIDATION REPORT`);
  console.log(`${'='.repeat(80)}\n`);
  console.log(`📊 Summary:`);
  console.log(` Total Tested: ${results.length}`);
  console.log(` ✅ Successful: ${successful.length}`);
  console.log(` ❌ Failed: ${failed.length}`);
  console.log(` Success Rate: ${Math.round(successRate)}%`);
  console.log(` Avg Recency Score: ${avgRecencyScore.toFixed(1)}/10`);
  console.log(` Avg Relevance Score: ${avgRelevanceScore.toFixed(1)}\n`);

  if (successful.length > 0) {
    console.log(`✅ Successful Queries:\n`);
    successful.forEach((r) => {
      const recentTag = r.recentCount > 0 ? `[${r.recentCount} recent]` : '';
      const relevantTag = r.relevantCount > 0 ? `[${r.relevantCount} relevant]` : '';
      console.log(` • ${r.name} ${recentTag} ${relevantTag}`);
      console.log(` Results: ${r.resultCount || 0}`);
      console.log(` Recency: ${(r.avgRecencyScore || 0)}/10`);
      console.log(` Relevance: ${(r.avgRelevanceScore || 0)}\n`);
    });
  }

  if (failed.length > 0) {
    console.log(`❌ Failed Queries:\n`);
    failed.forEach((r) => {
      console.log(` • ${r.name}`);
      console.log(` Error: ${r.error || 'No results found'}\n`);
    });
  }

  // Tuning recommendations
  console.log(`🔧 Tuning Recommendations:\n`);

  const lowRecency = successful.filter((r) => (r.recentCount || 0) === 0);
  if (lowRecency.length > 0) {
    console.log(` Alerts with no recent results (${lowRecency.length}):`);
    lowRecency.forEach((r) => console.log(` - ${r.name}`));
    console.log(` → Consider broadening keywords or checking topic activity\n`);
  }

  const lowRelevance = successful.filter(
    (r) => (r.relevantCount || 0) < (r.resultCount || 0) / 2,
  );
  if (lowRelevance.length > 0) {
    console.log(` Alerts with low relevance (${lowRelevance.length}):`);
    lowRelevance.forEach((r) => console.log(` - ${r.name}`));
    console.log(` → Add more specific keywords or domain filters\n`);
  }

  const fewResults = successful.filter((r) => (r.resultCount || 0) < 5);
  if (fewResults.length > 0) {
    console.log(` Alerts with few results (${fewResults.length}):`);
    fewResults.forEach((r) => console.log(` - ${r.name}`));
    console.log(` → May need broader search terms\n`);
  }

  return {
    total: results.length,
    successful: successful.length,
    failed: failed.length,
    successRate,
    totalRecent,
    totalRelevant,
    avgRecencyScore: Number(avgRecencyScore.toFixed(1)),
    avgRelevanceScore: Number(avgRelevanceScore.toFixed(1)),
    results,
  };
}

/**
 * Print command line usage.
 */
function printUsage() {
  console.log(`
Usage: node scripts/validate-scraping.js <markdown-file> [options]

Options:
  --max N          Maximum number of alerts to test (default: ${DEFAULT_MAX_ALERTS})
  --delay MS       Delay between tests in ms (default: ${DEFAULT_DELAY_MS})
  --no-randomize   Test alerts in order (default: randomized)
  --headless       Run browser in headless mode

Examples:
  node scripts/validate-scraping.js docs/google-alerts-broad.md
  node scripts/validate-scraping.js docs/google-alerts.md --max 3 --delay 8000
  node scripts/validate-scraping.js docs/google-alerts-broad.md --headless
`);
}

/**
 * Read the integer value that follows a command line flag.
 *
 * @param {string} flag - Flag name, used in error messages.
 * @param {string|undefined} raw - Raw argument following the flag.
 * @param {number} minimum - Smallest accepted value.
 * @returns {number} The parsed integer.
 * @throws {Error} When the value is missing, not a number, or below `minimum`.
 */
function parseIntegerOption(flag, raw, minimum) {
  const value = Number.parseInt(raw, 10);
  if (Number.isNaN(value) || value < minimum) {
    throw new Error(`${flag} expects an integer >= ${minimum}, got "${raw ?? ''}"`);
  }
  return value;
}

/**
 * Turn the option arguments (everything after the markdown file) into an
 * options object.
 *
 * @param {string[]} optionArgs - Raw command line arguments.
 * @returns {{maxAlerts: number, delayBetween: number, randomizeOrder: boolean,
 *   headless: boolean, saveNotes: boolean}} Parsed options with defaults.
 * @throws {Error} When a numeric option has a missing or invalid value.
 */
function parseCliOptions(optionArgs) {
  const options = {
    maxAlerts: DEFAULT_MAX_ALERTS,
    delayBetween: DEFAULT_DELAY_MS,
    randomizeOrder: true,
    headless: false,
    saveNotes: true,
  };

  for (let i = 0; i < optionArgs.length; i++) {
    const arg = optionArgs[i];
    if (arg === '--max') {
      options.maxAlerts = parseIntegerOption(arg, optionArgs[++i], 1);
    } else if (arg === '--delay') {
      options.delayBetween = parseIntegerOption(arg, optionArgs[++i], 0);
    } else if (arg === '--no-randomize') {
      options.randomizeOrder = false;
    } else if (arg === '--headless') {
      options.headless = true;
    } else {
      console.warn(`⚠️ Ignoring unknown option: ${arg}`);
    }
  }

  return options;
}

/**
 * Command line entry point: parse the alerts, run the batch in a browser and
 * write the JSON report and the markdown notes to the current directory.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);

  if (args.length === 0 || args[0] === '--help' || args[0] === '-h') {
    printUsage();
    process.exit(0);
  }

  const markdownFile = args[0];

  try {
    const options = parseCliOptions(args.slice(1));

    // Parse alerts from markdown
    console.log(`\n📖 Reading alerts from: ${markdownFile}\n`);
    const alerts = await parseAlertsFromMarkdown(markdownFile);
    console.log(`Found ${alerts.length} alerts\n`);

    if (alerts.length === 0) {
      console.error('❌ No alerts found in file');
      process.exit(1);
    }

    // Launch browser with anti-detection args.
    // NOTE(review): --no-sandbox and --disable-web-security weaken browser
    // isolation; confirm they are really required for this scraper.
    console.log('🚀 Launching browser...\n');
    const browser = await chromium.launch({
      headless: options.headless,
      slowMo: 50,
      args: [
        '--disable-blink-features=AutomationControlled',
        '--disable-dev-shm-usage',
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process',
      ],
    });

    try {
      // Validate alerts
      const { results, notes } = await validateBatch(browser, alerts, options);

      // Generate report
      const report = generateReport(results);

      // Save report to file
      const timestamp = Date.now();
      const reportFile = `validation-report-${timestamp}.json`;
      const notesFile = `validation-notes-${timestamp}.md`;

      await writeFile(reportFile, JSON.stringify(report, null, 2));
      console.log(`\n💾 JSON report saved to: ${reportFile}`);

      if (options.saveNotes && notes) {
        await writeFile(notesFile, notes);
        console.log(`📝 Detailed notes saved to: ${notesFile}\n`);
      }
    } finally {
      // Always release the browser, even when validation or saving failed.
      await browser.close();
      console.log('✅ Browser closed\n');
    }
  } catch (error) {
    console.error(`\n❌ Error: ${error.message}\n`);
    process.exit(1);
  }
}

// Run if called directly. Comparing against a proper file URL keeps the check
// working on Windows and for paths that need URL escaping.
const invokedPath = process.argv[1];
if (invokedPath && import.meta.url === pathToFileURL(invokedPath).href) {
  main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
}

export { parseAlertsFromMarkdown, validateBatch, generateReport };