// Listing metadata: rss-feedmonitor/scripts/playwright-scraper.js
// (454 lines, 14 KiB, JavaScript)

/**
 * Playwright scraper with human-like behavior for Google Alerts validation.
 *
 * Usage:
 *   node scripts/playwright-scraper.js "<search query>"
 *   node scripts/playwright-scraper.js --url "<page url>"
 */
import { pathToFileURL } from 'node:url';

import { chromium } from 'playwright';
import {
  randomDelay,
  humanMouseMove,
  randomMouseMovements,
  humanScroll,
  humanClick,
  humanType,
  humanWaitForLoad,
  simulateReading,
  getHumanizedContext
} from './human-behavior.js';
/**
 * Run a Google search for `query` by driving the page step by step: open the
 * home page, click the search field, type the query and press Enter, with
 * randomized pauses and pointer movement between the steps.
 *
 * @param {object} page - Page to drive (a Playwright page in this script).
 * @param {string} query - Text typed into the search field.
 * @returns {Promise<object>} The same `page`, once the post-submit wait is over.
 */
async function searchGoogle(page, query) {
  // The selector accepts either a <textarea> or an <input> named "q".
  const queryField = 'textarea[name="q"], input[name="q"]';
  const typingProfile = { minDelay: 60, maxDelay: 180, mistakes: 0.03 };
  const settleWindow = { minWait: 1500, maxWait: 3000 };

  console.log(`\n🔍 Searching Google for: "${query}"\n`);

  // Open the home page, pause, then wander the pointer around a little.
  await page.goto('https://www.google.com', { waitUntil: 'networkidle' });
  await randomDelay(1000, 2000);
  await randomMouseMovements(page, 2);

  // Wait for the search field, hesitate briefly, then click into it.
  await page.waitForSelector(queryField);
  await randomDelay(500, 1000);
  await humanClick(page, queryField);

  // Type the query, pause as if re-reading it, and submit with Enter.
  await humanType(page, queryField, query, typingProfile);
  await randomDelay(500, 1200);
  await page.keyboard.press('Enter');

  // Give the results page time to load before handing the page back.
  await humanWaitForLoad(page, settleWindow);
  return page;
}
/**
 * Collect the search results present on the current Google results page and
 * tag each one with a recency bucket derived from its date label.
 *
 * Scrolls and moves the pointer first (via `humanScroll` and
 * `randomMouseMovements`), then reads the DOM through `page.evaluate`.
 *
 * @param {object} page - Page showing Google results (a Playwright page here).
 * @returns {Promise<{results: object[], stats: string, recencyDist: object}>}
 *   `results` holds at most 20 entries of
 *   `{ title, url, domain, snippet, dateText, recency, recencyScore }`;
 *   `stats` is the text of `#result-stats`, or 'Unknown' when it is absent;
 *   `recencyDist` counts results per recency bucket, and its counts add up to
 *   `results.length`.
 */
async function extractResults(page) {
  // Scroll a little and scan the page with the pointer before reading it.
  await humanScroll(page, {
    scrollCount: 2,
    minScroll: 200,
    maxScroll: 500,
    minDelay: 800,
    maxDelay: 1500,
    randomDirection: true,
  });
  await randomMouseMovements(page, 3);

  // This callback is handed to page.evaluate, so it is kept self-contained:
  // it only uses names declared inside itself and browser globals.
  const results = await page.evaluate(() => {
    const MAX_RESULTS = 20;
    // Several container selectors are tried because result markup varies.
    const containerSelector =
      'div.g, div[data-sokoban-container], div[data-hveid], div.Gx5Zad';
    const snippetSelectors = [
      'div[data-sncf]',
      'div[style*="-webkit-line-clamp"]',
      '.VwiC3b',
      '.lyLwlc',
      '.s',
      'span:not([class])',
    ];
    const dateSelectors = [
      'span.MUxGbd',
      '.f',
      '.LEwnzc',
      'span[style*="color"]',
    ];

    // Return the first descendant of `root` matched by any selector, in order.
    const firstMatch = (root, selectors) => {
      for (const selector of selectors) {
        const found = root.querySelector(selector);
        if (found) return found;
      }
      return null;
    };

    const items = [];
    const seenUrls = new Set(); // the container selectors overlap, so dedupe by URL

    for (const element of document.querySelectorAll(containerSelector)) {
      if (items.length >= MAX_RESULTS) break;

      const titleElement = element.querySelector('h3');
      const linkElement = element.querySelector('a[href]');
      if (!titleElement || !linkElement || !linkElement.href) continue;

      const url = linkElement.href;
      // Skip non-http links and URLs that were already collected.
      if (!url.startsWith('http') || seenUrls.has(url)) continue;
      seenUrls.add(url);

      let domain;
      try {
        domain = new URL(url).hostname;
      } catch {
        // Best effort: a link whose URL cannot be parsed is simply left out.
        continue;
      }

      const snippetElement = firstMatch(element, snippetSelectors);
      const dateElement = firstMatch(element, dateSelectors);
      items.push({
        title: titleElement.innerText,
        url,
        domain,
        snippet: snippetElement ? snippetElement.innerText : '',
        dateText: dateElement ? dateElement.innerText : '',
      });
    }
    return items;
  });

  // Attach a recency bucket and score to every result.
  for (const result of results) {
    const { recency, recencyScore } = classifyRecency(result.dateText);
    result.recency = recency;
    result.recencyScore = recencyScore;
  }

  // Google's own "About N results" line, when the page has one.
  const stats = await page.evaluate(() => {
    const statsElement = document.querySelector('#result-stats');
    return statsElement ? statsElement.innerText : 'Unknown';
  });

  // Every bucket classifyRecency can return is listed, including `dated`,
  // so the counts always sum to results.length.
  const recencyDist = {
    today: 0,
    this_week: 0,
    this_month: 0,
    older: 0,
    dated: 0,
    unknown: 0,
  };
  for (const { recency } of results) {
    recencyDist[recency] += 1;
  }

  return { results, stats, recencyDist };
}

/**
 * Map a result's date label to a recency bucket and score.
 *
 * - `today` (10): the label mentions hours or minutes, says "N day(s)" with
 *   N <= 1, or contains the word "today" or "yesterday".
 * - `this_week` (8): "N days" with N <= 7.
 * - `this_month` (6): "N days" with N <= 30.
 * - `older` (3): "N days" with N > 30.
 * - `dated` (5): no relative age, but four consecutive digits (a year).
 * - `unknown` (0): anything else, including an empty label.
 *
 * "day" is matched as a whole word so that labels such as "Sunday" or
 * "birthday" are not mistaken for relative ages.
 *
 * @param {string} dateText - Raw date label scraped from a result.
 * @returns {{recency: string, recencyScore: number}}
 */
function classifyRecency(dateText) {
  const text = String(dateText ?? '').toLowerCase();

  if (text.includes('hour') || text.includes('minute')) {
    return { recency: 'today', recencyScore: 10 };
  }

  const dayMatch = text.match(/(\d+)\s*days?\b/);
  if (dayMatch) {
    const days = Number.parseInt(dayMatch[1], 10);
    if (days <= 1) return { recency: 'today', recencyScore: 10 };
    if (days <= 7) return { recency: 'this_week', recencyScore: 8 };
    if (days <= 30) return { recency: 'this_month', recencyScore: 6 };
    return { recency: 'older', recencyScore: 3 };
  }

  if (/\b(?:today|yesterday)\b/.test(text)) {
    return { recency: 'today', recencyScore: 10 };
  }

  if (/\d{4}/.test(text)) {
    return { recency: 'dated', recencyScore: 5 };
  }

  return { recency: 'unknown', recencyScore: 0 };
}
/**
 * Score every result against the search query and flag the relevant ones.
 *
 * Each result is updated in place with `relevanceScore` and `relevant`
 * (true when the score is 3 or more). Points are awarded as follows:
 *   +3 per query keyword found in the title,
 *   +1 per query keyword found in the snippet,
 *   +2 when the domain contains one of the preferred marketplace/forum domains,
 *   +1 per repair-related word found in the title or the snippet.
 *
 * Query keywords are the lower-cased words of the query with quotes and
 * parentheses removed, keeping only words longer than three characters that
 * do not contain "site:" or "http".
 *
 * @param {Array<{title: string, snippet: string, domain: string}>} results
 * @param {string} query - The search query the results came from.
 * @returns {Array<object>} The same `results` array.
 */
function calculateRelevance(results, query) {
  const ignoredFragments = ['site:', 'http', 'https'];
  const preferredDomains = ['reddit.com', 'kijiji.ca', 'craigslist', 'facebook.com', 'used.ca'];
  const repairWords = ['repair', 'fix', 'broken', 'replace', 'service', 'refurbish'];

  const keywords = query
    .toLowerCase()
    .replace(/['"()]/g, '')
    .split(/\s+/)
    .filter((word) => word.length > 3)
    .filter((word) => ignoredFragments.every((fragment) => !word.includes(fragment)));

  for (const entry of results) {
    const title = entry.title.toLowerCase();
    const snippet = entry.snippet.toLowerCase();

    const titleHits = keywords.filter((word) => title.includes(word)).length;
    const snippetHits = keywords.filter((word) => snippet.includes(word)).length;
    const domainBonus = preferredDomains.some((name) => entry.domain.includes(name)) ? 2 : 0;
    const repairHits = repairWords.filter(
      (word) => title.includes(word) || snippet.includes(word),
    ).length;

    // Title matches weigh three times as much as snippet matches.
    const score = titleHits * 3 + snippetHits + domainBonus + repairHits;
    entry.relevanceScore = score;
    entry.relevant = score >= 3;
  }

  return results;
}
/**
 * Validate one Google Alert query: search for it, extract the results, score
 * them for relevance, print a summary and return the collected metrics.
 *
 * A fresh context and page are opened for the run and closed again when it
 * ends, whether it succeeded or failed.
 *
 * @param {object} browser - Browser handed to `getHumanizedContext`.
 * @param {string} query - Query to validate.
 * @returns {Promise<object>} On success:
 *   `{ query, success, resultCount, recentCount, relevantCount,
 *      avgRecencyScore, avgRelevanceScore, recencyDist, stats, results }`
 *   where `results` holds at most the first 10 results. When anything inside
 *   the run throws: `{ query, success: false, error }`.
 */
async function validateQuery(browser, query) {
  const context = await getHumanizedContext(browser);
  const page = await context.newPage();

  try {
    await searchGoogle(page, query);
    const { results, stats, recencyDist } = await extractResults(page);
    calculateRelevance(results, query);

    const total = results.length;
    const isRecent = ({ recency }) => recency === 'today' || recency === 'this_week';
    const recentResults = results.filter(isRecent).length;
    const relevantResults = results.filter((item) => item.relevant).length;

    // Mean of a numeric field, formatted to one decimal; plain 0 when there are no results.
    const meanOf = (field) => {
      if (total === 0) return 0;
      const sum = results.reduce((running, item) => running + item[field], 0);
      return (sum / total).toFixed(1);
    };
    const avgRecencyScore = meanOf('recencyScore');
    const avgRelevanceScore = meanOf('relevanceScore');

    const reportLines = [
      `\n📊 Results Summary:`,
      ` Stats: ${stats}`,
      ` Found: ${results.length} results`,
      ` Recent (today/this week): ${recentResults}`,
      ` Relevant: ${relevantResults}`,
      ` Avg Recency Score: ${avgRecencyScore}/10`,
      ` Avg Relevance Score: ${avgRelevanceScore}\n`,
      `📅 Recency Distribution:`,
      ` Today: ${recencyDist.today}`,
      ` This Week: ${recencyDist.this_week}`,
      ` This Month: ${recencyDist.this_month}`,
      ` Older: ${recencyDist.older}`,
      ` Unknown: ${recencyDist.unknown}\n`,
    ];
    for (const line of reportLines) {
      console.log(line);
    }

    if (total > 0) {
      console.log(`✅ Top Results:\n`);
      for (const [position, item] of results.slice(0, 5).entries()) {
        const recencyTag = item.recency === 'unknown' ? '' : `[${item.recency}]`;
        const relevanceTag = item.relevant ? '✓' : '○';
        console.log(`${position + 1}. ${relevanceTag} ${item.title} ${recencyTag}`);
        console.log(` ${item.domain}`);
        console.log(` ${item.snippet.substring(0, 100)}...\n`);
      }
    } else {
      console.log(`❌ No results found for this query\n`);
    }

    // Linger on the page for a moment before it is closed.
    await simulateReading(page, 3000);

    return {
      query,
      success: total > 0,
      resultCount: total,
      recentCount: recentResults,
      relevantCount: relevantResults,
      avgRecencyScore: parseFloat(avgRecencyScore),
      avgRelevanceScore: parseFloat(avgRelevanceScore),
      recencyDist,
      stats,
      results: results.slice(0, 10), // only the first 10 are returned
    };
  } catch (error) {
    console.error(`❌ Error validating query: ${error.message}`);
    return {
      query,
      success: false,
      error: error.message,
    };
  } finally {
    await page.close();
    await context.close();
  }
}
/**
 * Open `url` in a fresh context, browse it briefly (pointer movement and
 * scrolling), and extract a title, a content excerpt and up to 20 links.
 *
 * @param {object} browser - Browser handed to `getHumanizedContext`.
 * @param {string} url - Page to open.
 * @param {{title?: string|string[], content?: string|string[]}} [selectors]
 *   Optional CSS selectors, tried in order, for the title and main content.
 *   A single selector may be given as a plain string. Defaults are
 *   `h1, h2, .title, #title` and `article, main, .content, #content`.
 * @returns {Promise<object>} `{ url, success: true, content }` where `content`
 *   is `{ title?, content?, links }` (`content.content` is capped at 1000
 *   characters, link text at 100), or `{ url, success: false, error }` when
 *   navigation or extraction throws.
 */
async function scrapeWebsite(browser, url, selectors = {}) {
  console.log(`\n🌐 Scraping: ${url}\n`);
  const context = await getHumanizedContext(browser);
  const page = await context.newPage();

  try {
    await page.goto(url, { waitUntil: 'networkidle', timeout: 30000 });
    await humanWaitForLoad(page, { minWait: 2000, maxWait: 4000 });

    // Look around, scroll through the page, then look around again.
    await randomMouseMovements(page, 2);
    await humanScroll(page, {
      scrollCount: 3,
      minScroll: 150,
      maxScroll: 400,
      minDelay: 1000,
      maxDelay: 2500,
      randomDirection: true,
    });
    await randomMouseMovements(page, 2);

    // This callback is handed to page.evaluate, so it is kept self-contained:
    // it only uses its `sels` argument, its own helpers and browser globals.
    const content = await page.evaluate((sels) => {
      const MAX_LINKS = 20;

      // Accept either one selector string or a list; fall back when unset.
      const asList = (value, fallback) => {
        const chosen = value || fallback;
        return Array.isArray(chosen) ? chosen : [chosen];
      };
      // First element in the document matched by any candidate, in order.
      const firstMatch = (candidates) => {
        for (const selector of candidates) {
          const element = document.querySelector(selector);
          if (element) return element;
        }
        return null;
      };
      // innerText exists only on HTML elements; SVG elements (an SVG <a>,
      // for instance) expose textContent instead.
      const textOf = (element) => String(element.innerText ?? element.textContent ?? '');
      // An SVG <a> exposes href as an object with a `baseVal` string rather
      // than as a string.
      const hrefOf = (anchor) =>
        (typeof anchor.href === 'string' ? anchor.href : String(anchor.href?.baseVal ?? ''));

      const data = {};

      const titleElement = firstMatch(asList(sels.title, ['h1', 'h2', '.title', '#title']));
      if (titleElement) {
        data.title = textOf(titleElement);
      }

      const contentElement = firstMatch(
        asList(sels.content, ['article', 'main', '.content', '#content']),
      );
      if (contentElement) {
        data.content = textOf(contentElement).substring(0, 1000);
      }

      // Only the first MAX_LINKS anchors are kept, so slice before mapping.
      data.links = Array.from(document.querySelectorAll('a'))
        .slice(0, MAX_LINKS)
        .map((anchor) => ({
          text: textOf(anchor).substring(0, 100),
          href: hrefOf(anchor),
        }));

      return data;
    }, selectors ?? {});

    console.log(`\n📄 Scraped Content:`);
    console.log(` Title: ${content.title || 'N/A'}`);
    console.log(` Content Length: ${content.content?.length || 0} chars`);
    console.log(` Links Found: ${content.links?.length || 0}\n`);

    // Linger on the page for a moment before it is closed.
    await simulateReading(page, 4000);

    return {
      url,
      success: true,
      content,
    };
  } catch (error) {
    console.error(`❌ Error scraping: ${error.message}`);
    return {
      url,
      success: false,
      error: error.message,
    };
  } finally {
    await page.close();
    await context.close();
  }
}
/**
 * Command-line entry point.
 *
 * - No arguments: print usage and exit with status 0.
 * - `--url <url>`: scrape that page with `scrapeWebsite`.
 * - `--url` without a value: report the mistake and set exit status 1.
 * - Anything else: join the arguments into one query and run `validateQuery`.
 *
 * The result object is printed as indented JSON, and the browser is closed
 * even when the run throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);

  if (args.length === 0) {
    console.log(`
Usage:
node scripts/playwright-scraper.js "your search query"
node scripts/playwright-scraper.js --url "https://example.com"
Examples:
node scripts/playwright-scraper.js '"macbook repair" Toronto'
node scripts/playwright-scraper.js --url "https://www.reddit.com/r/toronto"
`);
    process.exit(0);
  }

  const isUrlMode = args[0] === '--url';
  if (isUrlMode && !args[1]) {
    // Without this check a bare "--url" would be searched on Google as a query.
    console.error('❌ Missing URL after --url');
    process.exitCode = 1;
    return;
  }

  console.log('🚀 Launching browser...\n');
  // NOTE(review): --no-sandbox, --disable-setuid-sandbox and
  // --disable-web-security relax Chromium's isolation; confirm that is
  // acceptable for the pages this script visits.
  const browser = await chromium.launch({
    headless: false, // Set to true for production
    slowMo: 50, // Slight delay between actions (more human-like)
    args: [
      '--disable-blink-features=AutomationControlled',
      '--disable-dev-shm-usage',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-web-security',
      '--disable-features=IsolateOrigins,site-per-process',
    ],
  });

  try {
    const result = isUrlMode
      ? await scrapeWebsite(browser, args[1])
      : await validateQuery(browser, stripWrappingQuotes(args.join(' ')));
    console.log(`\n${JSON.stringify(result, null, 2)}`);
  } finally {
    await browser.close();
    console.log('\n✅ Browser closed\n');
  }
}

/**
 * Remove one pair of matching quotes that wraps the whole text.
 *
 * Quotes are only removed when the text starts and ends with the same quote
 * character and that character does not occur in between, so a query such as
 * `"macbook repair" Toronto` keeps its phrase quotes intact.
 *
 * @param {string} text
 * @returns {string}
 */
function stripWrappingQuotes(text) {
  const wrapped = text.match(/^(["'])((?:(?!\1)[\s\S])*)\1$/);
  return wrapped ? wrapped[2] : text;
}
// Run main() only when this file is the script Node was started with, not
// when it is imported by another module. The script path is converted with
// pathToFileURL so the comparison also holds for paths that need
// percent-encoding (spaces, for example) and for Windows drive paths, which a
// hand-built `file://${path}` string does not match.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  main().catch((error) => {
    console.error(error);
    // Report the failure through the exit status as well as the log.
    process.exitCode = 1;
  });
}
export { validateQuery, scrapeWebsite, searchGoogle, extractResults };