rss-feedmonitor/scripts/validate-scraping.js

423 lines
14 KiB
JavaScript

/**
* Validate multiple Google Alert queries from markdown files
* Uses Playwright with human-like behavior to test queries
*/
import { readFile } from 'fs/promises';
import { pathToFileURL } from 'url';
import { chromium } from 'playwright';
import { validateQuery } from './playwright-scraper.js';
/**
 * Parse alert queries from a markdown file.
 *
 * An alert starts at a line beginning with `**Alert Name:**` or `## `. Its
 * query is the content of the most recently opened fenced code block that
 * follows, up to the next alert line. Alerts for which no query lines were
 * collected are dropped.
 *
 * @param {string} filePath - Path of the markdown file to read (UTF-8).
 * @returns {Promise<Array<{name: string, query: string}>>} Alerts in file order.
 */
async function parseAlertsFromMarkdown(filePath) {
  const content = await readFile(filePath, 'utf-8');
  // Accept LF and CRLF so multi-line queries never carry stray '\r' characters.
  const lines = content.split(/\r?\n/);
  const alerts = [];
  let currentAlert = null;
  let inCodeBlock = false;
  let queryLines = [];

  // Store the alert collected so far, but only if it actually has a query.
  const flushCurrentAlert = () => {
    if (currentAlert && queryLines.length > 0) {
      currentAlert.query = queryLines.join('\n').trim();
      alerts.push(currentAlert);
    }
  };

  for (const line of lines) {
    // Detect alert name
    const isNameLine = line.startsWith('**Alert Name:**');
    const isHeading = line.startsWith('## ');
    if (isNameLine || isHeading) {
      flushCurrentAlert();
      let name;
      if (isNameLine) {
        // Prefer the back-ticked name; otherwise use the rest of the line.
        const match = line.match(/`([^`]+)`/);
        name = match ? match[1] : line.split('**Alert Name:**')[1].trim();
      } else {
        name = line.replace(/^## /, '').trim();
      }
      currentAlert = { name, query: '' };
      queryLines = [];
      continue;
    }

    const trimmed = line.trim();

    // Opening fence: three or more backticks, optionally followed by an info
    // string such as "text". Fences seen before any alert are skipped.
    if (!inCodeBlock && /^`{3,}[^`]*$/.test(trimmed)) {
      if (currentAlert) {
        inCodeBlock = true;
        queryLines = [];
      }
      continue;
    }

    // Closing fence: backticks only.
    if (inCodeBlock && /^`{3,}$/.test(trimmed)) {
      inCodeBlock = false;
      continue;
    }

    // Collect query lines
    if (inCodeBlock) {
      queryLines.push(line);
    }
  }

  // Add last alert
  flushCurrentAlert();

  // Clean up ALERT_NAME markers from queries (they cause false negatives).
  // Trim again so removing a trailing marker does not leave dangling whitespace.
  for (const alert of alerts) {
    alert.query = alert.query.replace(/-"ALERT_NAME:[^"]*"\s*/g, '').trim();
  }
  return alerts;
}
/**
 * Create detailed markdown notes for a single alert test.
 *
 * @param {string} alertName - Used as the section heading.
 * @param {object} result - Outcome of the test. Reads `success`, `query`,
 *   `error`, `resultCount`, `recentCount`, `relevantCount`, `avgRecencyScore`,
 *   `avgRelevanceScore`, `recencyDist` and `results`.
 * @returns {string} Markdown section terminated by a `---` separator.
 */
function createAlertNotes(alertName, result) {
  const timestamp = new Date().toISOString();
  const lines = [
    `## ${alertName}`,
    `**Tested:** ${timestamp}`,
    `**Query:** \`${result.query}\``,
    '',
  ];

  if (result.success) {
    // Normalise the counters once so the printed numbers and the analysis
    // below are based on the same values even when a field is missing.
    const resultCount = result.resultCount || 0;
    const recentCount = result.recentCount || 0;
    const relevantCount = result.relevantCount || 0;

    lines.push(`**Status:** ✅ Success`);
    lines.push(`**Total Results:** ${resultCount}`);
    lines.push(`**Recent Results:** ${recentCount} (today/this week)`);
    lines.push(`**Relevant Results:** ${relevantCount}`);
    lines.push(`**Avg Recency Score:** ${result.avgRecencyScore || 0}/10`);
    lines.push(`**Avg Relevance Score:** ${result.avgRelevanceScore || 0}`);
    lines.push('');

    if (result.recencyDist) {
      const dist = result.recencyDist;
      lines.push('**Recency Breakdown:**');
      lines.push(`- Today: ${dist.today}`);
      lines.push(`- This Week: ${dist.this_week}`);
      lines.push(`- This Month: ${dist.this_month}`);
      lines.push(`- Older: ${dist.older}`);
      lines.push(`- Unknown: ${dist.unknown}`);
      lines.push('');
    }

    // Add tuning recommendations
    lines.push('**Analysis:**');
    if (recentCount === 0) {
      lines.push('- ⚠️ No recent results - consider broadening keywords or checking if topic is active');
    } else if (recentCount >= 3) {
      lines.push('- ✅ Good number of recent results');
    }
    if (relevantCount < resultCount / 2) {
      lines.push('- ⚠️ Low relevance - consider adding more specific keywords or filters');
    } else {
      lines.push('- ✅ Good relevance score');
    }
    if (resultCount < 5) {
      lines.push('- ⚠️ Few results - may need to broaden search or check query syntax');
    }
    lines.push('');

    // Sample results (first three only)
    if (result.results && result.results.length > 0) {
      lines.push('**Sample Results:**');
      result.results.slice(0, 3).forEach((r, idx) => {
        const recencyTag = r.recency && r.recency !== 'unknown' ? `[${r.recency}]` : '';
        const relevanceTag = r.relevant ? '✓' : '○';
        lines.push(`${idx + 1}. ${relevanceTag} ${r.title} ${recencyTag}`);
        lines.push(` Domain: ${r.domain ?? 'unknown'}`);
        // A result without a snippet must not abort the whole note.
        const snippet = String(r.snippet ?? '');
        if (snippet.length > 0) {
          lines.push(` ${snippet.substring(0, 100)}...`);
        }
        lines.push('');
      });
    }
  } else {
    lines.push(`**Status:** ❌ Failed`);
    lines.push(`**Error:** ${result.error || 'No results found'}`);
    lines.push('');
    lines.push('**Recommendations:**');
    lines.push('- Check query syntax');
    lines.push('- Try broader keywords');
    lines.push('- Verify the topic has active discussions');
    lines.push('');
  }

  lines.push('---');
  lines.push('');
  return lines.join('\n');
}
/**
 * Test a batch of queries one after another, pausing between requests and
 * collecting markdown notes for every alert.
 *
 * @param {object} browser - Browser instance handed through to `validateQuery`.
 * @param {Array<{name: string, query: string}>} alerts - Candidate alerts.
 * @param {object} [options]
 * @param {number} [options.maxAlerts=5] - Max alerts to test.
 * @param {number} [options.delayBetween=12000] - Base delay between tests (ms);
 *   up to 3000 ms of random jitter is added on top.
 * @param {boolean} [options.randomizeOrder=true] - Shuffle before selecting.
 * @returns {Promise<{results: object[], notes: string}>} One result per tested
 *   alert plus the combined markdown notes. Other option keys (for example
 *   `saveNotes`) are accepted but not read here.
 */
async function validateBatch(browser, alerts, options = {}) {
  const {
    maxAlerts = 5,
    delayBetween = 12000,
    randomizeOrder = true
  } = options;

  // Work on a copy so the caller's array is never reordered.
  const pool = [...alerts];
  if (randomizeOrder) {
    // Fisher-Yates shuffle: unlike sort() with a random comparator, every
    // ordering is equally likely.
    for (let i = pool.length - 1; i > 0; i--) {
      const j = Math.floor(Math.random() * (i + 1));
      [pool[i], pool[j]] = [pool[j], pool[i]];
    }
  }
  const alertsToTest = pool.slice(0, maxAlerts);

  const results = [];
  const notes = [];
  notes.push(`# Validation Notes\n`);
  notes.push(`**Date:** ${new Date().toLocaleString()}`);
  notes.push(`**Alerts Tested:** ${alertsToTest.length}`);
  notes.push(`**Delay Between Tests:** ${Math.round(delayBetween / 1000)}s`);
  notes.push('');
  notes.push('---');
  notes.push('');

  for (let i = 0; i < alertsToTest.length; i++) {
    const alert = alertsToTest[i];
    console.log(`\n${'='.repeat(80)}`);
    console.log(`Testing ${i + 1}/${alertsToTest.length}: ${alert.name}`);
    console.log(`${'='.repeat(80)}\n`);

    try {
      const result = await validateQuery(browser, alert.query);
      const enrichedResult = {
        name: alert.name,
        ...result
      };
      // Build the note before recording anything, so a failure while
      // formatting cannot leave two results for the same alert.
      const alertNotes = createAlertNotes(alert.name, enrichedResult);
      results.push(enrichedResult);
      notes.push(alertNotes);
    } catch (error) {
      console.error(`❌ Failed to test "${alert.name}": ${error.message}`);
      const failedResult = {
        name: alert.name,
        query: alert.query,
        success: false,
        error: error.message
      };
      results.push(failedResult);
      notes.push(createAlertNotes(alert.name, failedResult));
    }

    // Delay between requests (avoid rate limiting). This runs after failures
    // as well: an error must not cause the next request to fire immediately.
    if (i < alertsToTest.length - 1) {
      const delay = delayBetween + Math.random() * 3000;
      console.log(`\n⏱️ Waiting ${Math.round(delay / 1000)}s before next test (polite scraping)...\n`);
      await new Promise(resolve => setTimeout(resolve, delay));
    }
  }

  return { results, notes: notes.join('\n') };
}
/**
 * Print a validation report with recency and relevance metrics to the console
 * and return the aggregated numbers.
 *
 * @param {object[]} results - Per-alert results; each is read for `success`,
 *   `name`, `error`, `resultCount`, `recentCount`, `relevantCount`,
 *   `avgRecencyScore` and `avgRelevanceScore`.
 * @returns {{total: number, successful: number, failed: number,
 *   successRate: number, totalRecent: number, totalRelevant: number,
 *   avgRecencyScore: number, avgRelevanceScore: number, results: object[]}}
 *   `successRate` is a percentage (0-100) and is 0 when `results` is empty.
 */
function generateReport(results) {
  const successful = results.filter(r => r.success);
  const failed = results.filter(r => !r.success);

  // Guard the percentage against 0/0 so an empty run reports 0 instead of NaN.
  const successRate = results.length > 0
    ? (successful.length / results.length) * 100
    : 0;

  // Calculate aggregate metrics over successful results; missing fields count as 0.
  const sumOf = (field) => successful.reduce((sum, r) => sum + (r[field] || 0), 0);
  const averageOf = (field) => (successful.length > 0
    ? (sumOf(field) / successful.length).toFixed(1)
    : 0);
  const totalRecent = sumOf('recentCount');
  const totalRelevant = sumOf('relevantCount');
  const avgRecencyScore = averageOf('avgRecencyScore');
  const avgRelevanceScore = averageOf('avgRelevanceScore');

  console.log(`\n${'='.repeat(80)}`);
  console.log(`VALIDATION REPORT`);
  console.log(`${'='.repeat(80)}\n`);
  console.log(`📊 Summary:`);
  console.log(` Total Tested: ${results.length}`);
  console.log(` ✅ Successful: ${successful.length}`);
  console.log(` ❌ Failed: ${failed.length}`);
  console.log(` Success Rate: ${Math.round(successRate)}%`);
  console.log(` Avg Recency Score: ${avgRecencyScore}/10`);
  console.log(` Avg Relevance Score: ${avgRelevanceScore}\n`);

  if (successful.length > 0) {
    console.log(`✅ Successful Queries:\n`);
    successful.forEach(r => {
      const recentTag = r.recentCount > 0 ? `[${r.recentCount} recent]` : '';
      const relevantTag = r.relevantCount > 0 ? `[${r.relevantCount} relevant]` : '';
      console.log(`${r.name} ${recentTag} ${relevantTag}`);
      console.log(` Results: ${r.resultCount || 0}`);
      console.log(` Recency: ${(r.avgRecencyScore || 0)}/10`);
      console.log(` Relevance: ${(r.avgRelevanceScore || 0)}\n`);
    });
  }

  if (failed.length > 0) {
    console.log(`❌ Failed Queries:\n`);
    failed.forEach(r => {
      console.log(`${r.name}`);
      console.log(` Error: ${r.error || 'No results found'}\n`);
    });
  }

  // Generate tuning recommendations. Counters are normalised with `|| 0`, the
  // same way they are displayed above, so a missing field is treated as zero.
  console.log(`🔧 Tuning Recommendations:\n`);
  const lowRecency = successful.filter(r => (r.recentCount || 0) === 0);
  if (lowRecency.length > 0) {
    console.log(` Alerts with no recent results (${lowRecency.length}):`);
    lowRecency.forEach(r => console.log(` - ${r.name}`));
    console.log(` → Consider broadening keywords or checking topic activity\n`);
  }
  const lowRelevance = successful.filter(
    r => (r.relevantCount || 0) < (r.resultCount || 0) / 2
  );
  if (lowRelevance.length > 0) {
    console.log(` Alerts with low relevance (${lowRelevance.length}):`);
    lowRelevance.forEach(r => console.log(` - ${r.name}`));
    console.log(` → Add more specific keywords or domain filters\n`);
  }
  const fewResults = successful.filter(r => (r.resultCount || 0) < 5);
  if (fewResults.length > 0) {
    console.log(` Alerts with few results (${fewResults.length}):`);
    fewResults.forEach(r => console.log(` - ${r.name}`));
    console.log(` → May need broader search terms\n`);
  }

  return {
    total: results.length,
    successful: successful.length,
    failed: failed.length,
    successRate,
    totalRecent,
    totalRelevant,
    avgRecencyScore: parseFloat(avgRecencyScore),
    avgRelevanceScore: parseFloat(avgRelevanceScore),
    results
  };
}
/**
 * Parse the value of an integer command line option.
 *
 * @param {string} flag - Option name, used in the error message.
 * @param {string} raw - Raw value taken from the command line.
 * @param {number} min - Smallest accepted value.
 * @returns {number} The parsed integer.
 * @throws {Error} When `raw` is not an integer or is below `min`.
 */
function parseIntegerOption(flag, raw, min) {
  const value = Number.parseInt(raw, 10);
  if (Number.isNaN(value) || value < min) {
    throw new Error(`Invalid value for ${flag}: "${raw}" (expected an integer >= ${min})`);
  }
  return value;
}

/**
 * Main function: read the alerts from the markdown file named on the command
 * line, validate a batch of them in a browser, print the report and write the
 * JSON report (and, unless disabled, the markdown notes) to the current
 * directory. Prints usage and exits with 0 when called without arguments;
 * exits with 1 on any error.
 */
async function main() {
  const args = process.argv.slice(2);
  if (args.length === 0) {
    console.log(`
Usage:
node scripts/validate-scraping.js <markdown-file> [options]
Options:
--max N Maximum number of alerts to test (default: 5)
--delay MS Delay between tests in ms (default: 12000)
--no-randomize Test alerts in order (default: randomized)
--headless Run browser in headless mode
Examples:
node scripts/validate-scraping.js docs/google-alerts-broad.md
node scripts/validate-scraping.js docs/google-alerts.md --max 3 --delay 8000
node scripts/validate-scraping.js docs/google-alerts-broad.md --headless
`);
    process.exit(0);
  }

  const markdownFile = args[0];
  const options = {
    maxAlerts: 5,
    delayBetween: 12000, // Increased default for polite scraping
    randomizeOrder: true,
    headless: false,
    saveNotes: true
  };

  try {
    // Parse command line options. Done inside the try so that an invalid
    // number is reported like every other error instead of flowing on as NaN
    // (which would silently select zero alerts).
    for (let i = 1; i < args.length; i++) {
      const arg = args[i];
      const value = args[i + 1];
      if (arg === '--max' && value) {
        options.maxAlerts = parseIntegerOption('--max', value, 1);
        i++;
      } else if (arg === '--delay' && value) {
        options.delayBetween = parseIntegerOption('--delay', value, 0);
        i++;
      } else if (arg === '--no-randomize') {
        options.randomizeOrder = false;
      } else if (arg === '--headless') {
        options.headless = true;
      }
    }

    // Parse alerts from markdown
    console.log(`\n📖 Reading alerts from: ${markdownFile}\n`);
    const alerts = await parseAlertsFromMarkdown(markdownFile);
    console.log(`Found ${alerts.length} alerts\n`);
    if (alerts.length === 0) {
      console.error('❌ No alerts found in file');
      process.exit(1);
    }

    // Launch browser with anti-detection args
    // NOTE(review): --no-sandbox, --disable-setuid-sandbox and
    // --disable-web-security weaken browser isolation; confirm they are
    // really required for this scraper.
    console.log('🚀 Launching browser...\n');
    const browser = await chromium.launch({
      headless: options.headless,
      slowMo: 50,
      args: [
        '--disable-blink-features=AutomationControlled',
        '--disable-dev-shm-usage',
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-web-security',
        '--disable-features=IsolateOrigins,site-per-process'
      ]
    });

    try {
      // Validate alerts
      const { results, notes } = await validateBatch(browser, alerts, options);

      // Generate report
      const report = generateReport(results);

      // Save report to file; both files share one timestamp so they pair up.
      const timestamp = Date.now();
      const reportFile = `validation-report-${timestamp}.json`;
      const notesFile = `validation-notes-${timestamp}.md`;
      await writeFile(reportFile, JSON.stringify(report, null, 2));
      console.log(`\n💾 JSON report saved to: ${reportFile}`);
      if (options.saveNotes && notes) {
        await writeFile(notesFile, notes);
        console.log(`📝 Detailed notes saved to: ${notesFile}\n`);
      }
    } finally {
      // Always release the browser, even when validation or writing fails.
      await browser.close();
      console.log('✅ Browser closed\n');
    }
  } catch (error) {
    console.error(`\n❌ Error: ${error.message}\n`);
    process.exit(1);
  }
}
// Run if called directly, i.e. when this file is the script Node was started
// with. pathToFileURL builds the URL the same way Node does for
// import.meta.url, so the comparison also holds for Windows drive paths and
// for paths that need percent-encoding (spaces, '#', ...), where a hand-built
// `file://` string does not match.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  main().catch((error) => {
    console.error(error);
    // Report the failure through the exit status as well.
    process.exitCode = 1;
  });
}
// Add missing import
import { writeFile } from 'fs/promises';
export { parseAlertsFromMarkdown, validateBatch, generateReport };