423 lines
14 KiB
JavaScript
423 lines
14 KiB
JavaScript
/**
 * Validate multiple Google Alert queries from markdown files.
 * Uses Playwright with human-like behavior to test queries.
 */
|
|
|
|
import { readFile } from 'fs/promises';
import { pathToFileURL } from 'url';

import { chromium } from 'playwright';

import { validateQuery } from './playwright-scraper.js';
|
|
|
|
/**
 * Parse alert queries from a markdown file.
 *
 * An alert starts at a line beginning with `**Alert Name:**` (name taken from
 * the first backtick-quoted span, else the rest of the line) or `## ` (name is
 * the heading text). Its query is the content of the fenced code block that
 * follows; if an alert has several code blocks, the last one wins. Alerts
 * that never collect any code-block line are dropped.
 *
 * @param {string} filePath - Path of the markdown file to read (UTF-8).
 * @returns {Promise<Array<{name: string, query: string}>>} Alerts in file order.
 * @throws Rejects with the underlying `readFile` error if the file is unreadable.
 */
async function parseAlertsFromMarkdown(filePath) {
  const content = await readFile(filePath, 'utf-8');
  // Accept both LF and CRLF so Windows-edited files do not leak '\r' into queries.
  const lines = content.split(/\r?\n/);

  const alerts = [];
  let currentAlert = null;
  let inCodeBlock = false;
  let queryLines = [];

  // Store the alert being built, but only if it actually collected a query.
  const flushCurrentAlert = () => {
    if (currentAlert && queryLines.length > 0) {
      currentAlert.query = queryLines.join('\n').trim();
      alerts.push(currentAlert);
    }
  };

  for (const line of lines) {
    // Detect alert name
    const isNamedAlert = line.startsWith('**Alert Name:**');
    const isHeading = line.startsWith('## ');
    if (isNamedAlert || isHeading) {
      flushCurrentAlert();

      let name;
      if (isNamedAlert) {
        const match = line.match(/`([^`]+)`/);
        name = match ? match[1] : line.split('**Alert Name:**')[1].trim();
      } else {
        name = line.replace(/^## /, '').trim();
      }

      currentAlert = { name, query: '' };
      queryLines = [];
      continue;
    }

    // Detect code blocks containing queries. An opening fence may carry an
    // info string (e.g. ```text); a closing fence is always bare.
    const trimmed = line.trim();
    const isOpeningFence = !inCodeBlock && /^```[\w-]*$/.test(trimmed);
    const isClosingFence = inCodeBlock && trimmed === '```';
    if (isOpeningFence || isClosingFence) {
      if (isClosingFence) {
        inCodeBlock = false;
      } else if (currentAlert) {
        // Code blocks that appear before any alert header are ignored.
        inCodeBlock = true;
        queryLines = [];
      }
      continue;
    }

    // Collect query lines
    if (inCodeBlock) {
      queryLines.push(line);
    }
  }

  // Add last alert
  flushCurrentAlert();

  // Clean up ALERT_NAME markers from queries (they cause false negatives),
  // then trim the whitespace that removing a trailing marker leaves behind.
  alerts.forEach(alert => {
    alert.query = alert.query.replace(/-"ALERT_NAME:[^"]*"\s*/g, '').trim();
  });

  return alerts;
}
|
|
|
|
/**
 * Create detailed markdown notes for a single alert test.
 *
 * @param {string} alertName - Alert name, used as the section heading.
 * @param {object} result - Test outcome. Always read: `query`, `success`.
 *   On success also read: `resultCount`, `recentCount`, `relevantCount`,
 *   `avgRecencyScore`, `avgRelevanceScore`, optional `recencyDist`
 *   (`today`, `this_week`, `this_month`, `older`, `unknown`) and optional
 *   `results` (items with `title`, `domain`, `snippet`, `recency`, `relevant`).
 *   On failure: optional `error`.
 * @returns {string} Markdown section ending with a `---` separator line.
 */
function createAlertNotes(alertName, result) {
  const lines = [];
  const timestamp = new Date().toISOString();

  lines.push(`## ${alertName}`);
  lines.push(`**Tested:** ${timestamp}`);
  lines.push(`**Query:** \`${result.query}\``);
  lines.push('');

  if (result.success) {
    // Normalize the counts once so the printed numbers and the analysis
    // below are always derived from the same values.
    const resultCount = result.resultCount ?? 0;
    const recentCount = result.recentCount ?? 0;
    const relevantCount = result.relevantCount ?? 0;

    lines.push(`**Status:** ✅ Success`);
    lines.push(`**Total Results:** ${resultCount}`);
    lines.push(`**Recent Results:** ${recentCount} (today/this week)`);
    lines.push(`**Relevant Results:** ${relevantCount}`);
    lines.push(`**Avg Recency Score:** ${result.avgRecencyScore || 0}/10`);
    lines.push(`**Avg Relevance Score:** ${result.avgRelevanceScore || 0}`);
    lines.push('');

    if (result.recencyDist) {
      lines.push('**Recency Breakdown:**');
      lines.push(`- Today: ${result.recencyDist.today}`);
      lines.push(`- This Week: ${result.recencyDist.this_week}`);
      lines.push(`- This Month: ${result.recencyDist.this_month}`);
      lines.push(`- Older: ${result.recencyDist.older}`);
      lines.push(`- Unknown: ${result.recencyDist.unknown}`);
      lines.push('');
    }

    // Add tuning recommendations
    lines.push('**Analysis:**');
    if (recentCount === 0) {
      lines.push('- ⚠️ No recent results - consider broadening keywords or checking if topic is active');
    } else if (recentCount >= 3) {
      lines.push('- ✅ Good number of recent results');
    }

    // Relevance is only meaningful when there is something to judge.
    if (resultCount > 0) {
      if (relevantCount < resultCount / 2) {
        lines.push('- ⚠️ Low relevance - consider adding more specific keywords or filters');
      } else {
        lines.push('- ✅ Good relevance score');
      }
    }

    if (resultCount < 5) {
      lines.push('- ⚠️ Few results - may need to broaden search or check query syntax');
    }

    lines.push('');

    // Sample results
    if (result.results && result.results.length > 0) {
      lines.push('**Sample Results:**');
      result.results.slice(0, 3).forEach((r, idx) => {
        const recencyTag = r.recency && r.recency !== 'unknown' ? `[${r.recency}]` : '';
        const relevanceTag = r.relevant ? '✓' : '○';
        // A result may lack a snippet; only mark the preview as elided when
        // it was actually cut.
        const snippet = typeof r.snippet === 'string' ? r.snippet : '';
        const preview = snippet.length > 100 ? `${snippet.substring(0, 100)}...` : snippet;
        lines.push(`${idx + 1}. ${relevanceTag} ${r.title} ${recencyTag}`);
        lines.push(` Domain: ${r.domain}`);
        lines.push(` ${preview}`);
        lines.push('');
      });
    }
  } else {
    lines.push(`**Status:** ❌ Failed`);
    lines.push(`**Error:** ${result.error || 'No results found'}`);
    lines.push('');
    lines.push('**Recommendations:**');
    lines.push('- Check query syntax');
    lines.push('- Try broader keywords');
    lines.push('- Verify the topic has active discussions');
    lines.push('');
  }

  lines.push('---');
  lines.push('');

  return lines.join('\n');
}
|
|
|
|
/**
 * Return a uniformly shuffled copy of `items` (Fisher-Yates); the input array
 * is left untouched.
 */
function shuffleAlerts(items) {
  const shuffled = [...items];
  for (let i = shuffled.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
  }
  return shuffled;
}

/**
 * Test a batch of queries with delays between each and note-taking.
 *
 * @param {object} browser - Browser handle, passed through to `validateQuery`.
 * @param {Array<{name: string, query: string}>} alerts - Alerts to choose from.
 * @param {object} [options]
 * @param {number} [options.maxAlerts=5] - Max alerts to test.
 * @param {number} [options.delayBetween=12000] - Base delay between tests (ms);
 *   up to 3000 ms of random jitter is added on top.
 * @param {boolean} [options.randomizeOrder=true] - Shuffle before truncating.
 * @returns {Promise<{results: object[], notes: string}>} One result per tested
 *   alert plus the markdown notes. Notes are always built; whether they are
 *   written to disk is the caller's decision.
 */
async function validateBatch(browser, alerts, options = {}) {
  const {
    maxAlerts = 5,          // Max alerts to test
    delayBetween = 12000,   // Delay between tests (ms) - increased for politeness
    randomizeOrder = true   // Randomize test order
  } = options;

  // Optionally randomize order. A comparator-based "random sort" is biased,
  // so use a proper shuffle.
  const candidates = randomizeOrder ? shuffleAlerts(alerts) : alerts;
  const alertsToTest = candidates.slice(0, maxAlerts);

  const results = [];
  const notes = [];

  notes.push(`# Validation Notes\n`);
  notes.push(`**Date:** ${new Date().toLocaleString()}`);
  notes.push(`**Alerts Tested:** ${alertsToTest.length}`);
  notes.push(`**Delay Between Tests:** ${Math.round(delayBetween / 1000)}s`);
  notes.push('');
  notes.push('---');
  notes.push('');

  for (let i = 0; i < alertsToTest.length; i++) {
    const alert = alertsToTest[i];

    console.log(`\n${'='.repeat(80)}`);
    console.log(`Testing ${i + 1}/${alertsToTest.length}: ${alert.name}`);
    console.log(`${'='.repeat(80)}\n`);

    // Exactly one outcome is recorded per alert, whether the test succeeded,
    // the scrape threw, or note generation for a successful scrape threw.
    let outcome;
    try {
      const result = await validateQuery(browser, alert.query);
      outcome = {
        name: alert.name,
        ...result
      };

      // Add notes for this alert
      notes.push(createAlertNotes(alert.name, outcome));
    } catch (error) {
      console.error(`❌ Failed to test "${alert.name}": ${error.message}`);
      outcome = {
        name: alert.name,
        query: alert.query,
        success: false,
        error: error.message
      };
      notes.push(createAlertNotes(alert.name, outcome));
    }
    results.push(outcome);

    // Delay between requests (avoid rate limiting). This runs after failures
    // too: an error is often the first sign of throttling, so the next
    // request must not fire immediately.
    if (i < alertsToTest.length - 1) {
      const delay = delayBetween + Math.random() * 3000; // More random variation
      console.log(`\n⏱️ Waiting ${Math.round(delay / 1000)}s before next test (polite scraping)...\n`);
      await new Promise(resolve => setTimeout(resolve, delay));
    }
  }

  return { results, notes: notes.join('\n') };
}
|
|
|
|
/**
 * Generate validation report with recency and relevance metrics.
 *
 * Prints a human-readable summary and tuning recommendations to the console
 * and returns the aggregate numbers.
 *
 * @param {object[]} results - Per-alert outcomes. Fields read: `name`,
 *   `success`, `error`, `resultCount`, `recentCount`, `relevantCount`,
 *   `avgRecencyScore`, `avgRelevanceScore`. Missing counts are treated as 0.
 * @returns {{total: number, successful: number, failed: number,
 *   successRate: number, totalRecent: number, totalRelevant: number,
 *   avgRecencyScore: number, avgRelevanceScore: number, results: object[]}}
 *   `successRate` is a percentage (0-100); it is 0 for an empty input.
 */
function generateReport(results) {
  const successful = results.filter(r => r.success);
  const failed = results.filter(r => !r.success);

  // Calculate aggregate metrics
  const totalRecent = successful.reduce((sum, r) => sum + (r.recentCount ?? 0), 0);
  const totalRelevant = successful.reduce((sum, r) => sum + (r.relevantCount ?? 0), 0);
  const avgRecencyScore = successful.length > 0
    ? (successful.reduce((sum, r) => sum + (r.avgRecencyScore ?? 0), 0) / successful.length).toFixed(1)
    : 0;
  const avgRelevanceScore = successful.length > 0
    ? (successful.reduce((sum, r) => sum + (r.avgRelevanceScore ?? 0), 0) / successful.length).toFixed(1)
    : 0;
  // Guard the division: an empty batch must report 0%, not NaN (which would
  // also serialize to null in the JSON report).
  const successRate = results.length > 0
    ? (successful.length / results.length) * 100
    : 0;

  console.log(`\n${'='.repeat(80)}`);
  console.log(`VALIDATION REPORT`);
  console.log(`${'='.repeat(80)}\n`);

  console.log(`📊 Summary:`);
  console.log(` Total Tested: ${results.length}`);
  console.log(` ✅ Successful: ${successful.length}`);
  console.log(` ❌ Failed: ${failed.length}`);
  console.log(` Success Rate: ${Math.round(successRate)}%`);
  console.log(` Avg Recency Score: ${avgRecencyScore}/10`);
  console.log(` Avg Relevance Score: ${avgRelevanceScore}\n`);

  if (successful.length > 0) {
    console.log(`✅ Successful Queries:\n`);
    successful.forEach(r => {
      const recentTag = r.recentCount > 0 ? `[${r.recentCount} recent]` : '';
      const relevantTag = r.relevantCount > 0 ? `[${r.relevantCount} relevant]` : '';
      console.log(` • ${r.name} ${recentTag} ${relevantTag}`);
      console.log(` Results: ${r.resultCount || 0}`);
      console.log(` Recency: ${(r.avgRecencyScore || 0)}/10`);
      console.log(` Relevance: ${(r.avgRelevanceScore || 0)}\n`);
    });
  }

  if (failed.length > 0) {
    console.log(`❌ Failed Queries:\n`);
    failed.forEach(r => {
      console.log(` • ${r.name}`);
      console.log(` Error: ${r.error || 'No results found'}\n`);
    });
  }

  // Generate tuning recommendations. Every filter treats a missing count as
  // 0, matching the numbers printed in the summary above.
  console.log(`🔧 Tuning Recommendations:\n`);

  const lowRecency = successful.filter(r => (r.recentCount ?? 0) === 0);
  if (lowRecency.length > 0) {
    console.log(` Alerts with no recent results (${lowRecency.length}):`);
    lowRecency.forEach(r => console.log(` - ${r.name}`));
    console.log(` → Consider broadening keywords or checking topic activity\n`);
  }

  const lowRelevance = successful.filter(
    r => (r.relevantCount ?? 0) < (r.resultCount ?? 0) / 2
  );
  if (lowRelevance.length > 0) {
    console.log(` Alerts with low relevance (${lowRelevance.length}):`);
    lowRelevance.forEach(r => console.log(` - ${r.name}`));
    console.log(` → Add more specific keywords or domain filters\n`);
  }

  const fewResults = successful.filter(r => (r.resultCount ?? 0) < 5);
  if (fewResults.length > 0) {
    console.log(` Alerts with few results (${fewResults.length}):`);
    fewResults.forEach(r => console.log(` - ${r.name}`));
    console.log(` → May need broader search terms\n`);
  }

  return {
    total: results.length,
    successful: successful.length,
    failed: failed.length,
    successRate,
    totalRecent,
    totalRelevant,
    avgRecencyScore: parseFloat(avgRecencyScore),
    avgRelevanceScore: parseFloat(avgRelevanceScore),
    results
  };
}
|
|
|
|
/**
 * Main function: command-line entry point.
 *
 * Reads the markdown file named by the first CLI argument, validates up to
 * `--max` alerts in a Playwright-launched Chromium, prints the report, and
 * writes `validation-report-<ts>.json` (plus `validation-notes-<ts>.md`) to
 * the current working directory.
 *
 * Exits the process with code 0 after printing usage (no arguments) and with
 * code 1 on an invalid option value, an empty alert list, or any error.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);

  if (args.length === 0) {
    console.log([
      '',
      'Usage:',
      'node scripts/validate-scraping.js <markdown-file> [options]',
      '',
      'Options:',
      '--max N Maximum number of alerts to test (default: 5)',
      // Keep this in sync with options.delayBetween below.
      '--delay MS Delay between tests in ms (default: 12000)',
      '--no-randomize Test alerts in order (default: randomized)',
      '--headless Run browser in headless mode',
      '',
      'Examples:',
      'node scripts/validate-scraping.js docs/google-alerts-broad.md',
      'node scripts/validate-scraping.js docs/google-alerts.md --max 3 --delay 8000',
      'node scripts/validate-scraping.js docs/google-alerts-broad.md --headless',
      ''
    ].join('\n'));
    process.exit(0);
  }

  const markdownFile = args[0];
  const options = {
    maxAlerts: 5,
    delayBetween: 12000, // Increased default for polite scraping
    randomizeOrder: true,
    headless: false,
    saveNotes: true
  };

  // Parse a base-10 integer option value. A non-numeric value would become
  // NaN and silently test zero alerts (--max) or remove the delay (--delay),
  // so reject it up front instead.
  const parseIntegerOption = (flag, raw, min) => {
    const value = Number.parseInt(raw, 10);
    if (Number.isNaN(value) || value < min) {
      console.error(`\n❌ Invalid value for ${flag}: ${raw} (expected an integer >= ${min})\n`);
      process.exit(1);
    }
    return value;
  };

  // Parse command line options
  for (let i = 1; i < args.length; i++) {
    if (args[i] === '--max' && args[i + 1]) {
      options.maxAlerts = parseIntegerOption('--max', args[i + 1], 1);
      i++;
    } else if (args[i] === '--delay' && args[i + 1]) {
      options.delayBetween = parseIntegerOption('--delay', args[i + 1], 0);
      i++;
    } else if (args[i] === '--no-randomize') {
      options.randomizeOrder = false;
    } else if (args[i] === '--headless') {
      options.headless = true;
    }
  }

  try {
    // Parse alerts from markdown
    console.log(`\n📖 Reading alerts from: ${markdownFile}\n`);
    const alerts = await parseAlertsFromMarkdown(markdownFile);
    console.log(`Found ${alerts.length} alerts\n`);

    if (alerts.length === 0) {
      console.error('❌ No alerts found in file');
      process.exit(1);
    }

    // Launch browser with anti-detection args
    // NOTE(review): --no-sandbox, --disable-setuid-sandbox and
    // --disable-web-security weaken Chromium's isolation while it loads
    // third-party pages; confirm they are really required here.
    console.log('🚀 Launching browser...\n');
    const browser = await chromium.launch({
      headless: options.headless,
      slowMo: 50,
      args: [
        '--disable-blink-features=AutomationControlled',
        '--disable-dev-shm-usage',
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process'
      ]
    });

    try {
      // Validate alerts
      const { results, notes } = await validateBatch(browser, alerts, options);

      // Generate report
      const report = generateReport(results);

      // Save report to file
      const timestamp = Date.now();
      const reportFile = `validation-report-${timestamp}.json`;
      const notesFile = `validation-notes-${timestamp}.md`;

      await writeFile(reportFile, JSON.stringify(report, null, 2));
      console.log(`\n💾 JSON report saved to: ${reportFile}`);

      if (options.saveNotes && notes) {
        await writeFile(notesFile, notes);
        console.log(`📝 Detailed notes saved to: ${notesFile}\n`);
      }
    } finally {
      // Always release the browser, even when validation or file writes fail.
      await browser.close();
      console.log('✅ Browser closed\n');
    }
  } catch (error) {
    console.error(`\n❌ Error: ${error.message}\n`);
    process.exit(1);
  }
}
|
|
|
|
// Run if called directly (not when this module is imported by another file).
// argv[1] is converted with pathToFileURL because a hand-built
// `file://${path}` string does not equal import.meta.url for paths that need
// percent-encoding (e.g. spaces) or for Windows drive-letter paths.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  main().catch(console.error);
}
|
|
|
|
// Add missing import
|
|
import { writeFile } from 'fs/promises';
|
|
|
|
export { parseAlertsFromMarkdown, validateBatch, generateReport };
|
|
|