/**
 * Playwright scraper with human-like behavior for Google Alerts validation
 * Usage: node scripts/playwright-scraper.js [query]
 */

import { pathToFileURL } from 'node:url';

import { chromium } from 'playwright';
import {
  randomDelay,
  humanMouseMove,
  randomMouseMovements,
  humanScroll,
  humanClick,
  humanType,
  humanWaitForLoad,
  simulateReading,
  getHumanizedContext
} from './human-behavior.js';

/** Maximum number of search results collected from a single results page. */
const MAX_RESULTS = 20;

/** Domain fragments that earn a relevance bonus when found in a result's hostname. */
const TARGET_DOMAINS = ['reddit.com', 'kijiji.ca', 'craigslist', 'facebook.com', 'used.ca'];

/** Repair-related words that each add one relevance point when present. */
const REPAIR_TERMS = ['repair', 'fix', 'broken', 'replace', 'service', 'refurbish'];

/** Approximate number of days represented by each relative-date unit. */
const DAYS_PER_UNIT = Object.freeze({ day: 1, week: 7, month: 30, year: 365 });

/** Help text printed when the script is invoked without usable arguments. */
const USAGE = `
Usage:
  node scripts/playwright-scraper.js "your search query"
  node scripts/playwright-scraper.js --url "https://example.com"

Examples:
  node scripts/playwright-scraper.js '"macbook repair" Toronto'
  node scripts/playwright-scraper.js --url "https://www.reddit.com/r/toronto"
`;

/**
 * Search Google with a query and validate results.
 *
 * @param {import('playwright').Page} page - Page used for the search.
 * @param {string} query - Text typed into the Google search box.
 * @returns {Promise<import('playwright').Page>} The same page, after the search was submitted.
 */
async function searchGoogle(page, query) {
  console.log(`\n🔍 Searching Google for: "${query}"\n`);

  // Navigate to Google
  await page.goto('https://www.google.com', { waitUntil: 'networkidle' });
  await randomDelay(1000, 2000);

  // Random mouse movements (looking around the page)
  await randomMouseMovements(page, 2);

  // Find and focus search box (Google serves either a textarea or an input)
  const searchBox = 'textarea[name="q"], input[name="q"]';
  await page.waitForSelector(searchBox);
  await randomDelay(500, 1000);

  // Click search box with human behavior
  await humanClick(page, searchBox);

  // Type query with realistic timing
  await humanType(page, searchBox, query, {
    minDelay: 60,
    maxDelay: 180,
    mistakes: 0.03
  });

  // Random pause before submitting (reading what we typed)
  await randomDelay(500, 1200);

  // Submit search (press Enter)
  await page.keyboard.press('Enter');

  // Wait for results to load
  await humanWaitForLoad(page, { minWait: 1500, maxWait: 3000 });

  return page;
}

/**
 * Classify the free-text date label of a search result.
 *
 * Categories and scores:
 *   today (10), this_week (8), this_month (6), older (3),
 *   dated (5, a four-digit year is present), unknown (0).
 *
 * @param {string} dateText - Raw date label scraped from the result (may be empty).
 * @returns {{ recency: string, recencyScore: number }} Category and its score.
 */
function classifyRecency(dateText) {
  const text = String(dateText ?? '').toLowerCase();

  if (text.includes('hour') || text.includes('minute') || /\btoday\b/.test(text)) {
    return { recency: 'today', recencyScore: 10 };
  }

  // "3 days ago", "2 weeks ago", "1 month ago", ... converted to an approximate day count.
  const relative = text.match(/(\d+)\s*(day|week|month|year)s?\b/);
  if (relative) {
    const ageInDays = Number.parseInt(relative[1], 10) * DAYS_PER_UNIT[relative[2]];
    if (ageInDays <= 7) return { recency: 'this_week', recencyScore: 8 };
    if (ageInDays <= 30) return { recency: 'this_month', recencyScore: 6 };
    return { recency: 'older', recencyScore: 3 };
  }

  // Yesterday is recent, but it is not "today".
  if (/\byesterday\b/.test(text)) {
    return { recency: 'this_week', recencyScore: 8 };
  }

  // Has a year in the date
  if (/\d{4}/.test(text)) {
    return { recency: 'dated', recencyScore: 5 };
  }

  return { recency: 'unknown', recencyScore: 0 };
}

/**
 * Extract search results from Google with recency and relevance detection.
 *
 * @param {import('playwright').Page} page - Page currently showing Google results.
 * @returns {Promise<{ results: object[], stats: string, recencyDist: object }>}
 *   `results` holds up to MAX_RESULTS entries ({ title, url, domain, snippet, dateText,
 *   recency, recencyScore }), `stats` is the text of `#result-stats` (or 'Unknown'),
 *   and `recencyDist` counts results per recency category.
 */
async function extractResults(page) {
  // Scroll to see more results
  await humanScroll(page, {
    scrollCount: 2,
    minScroll: 200,
    maxScroll: 500,
    minDelay: 800,
    maxDelay: 1500,
    randomDirection: true
  });

  // Random mouse movements (scanning results)
  await randomMouseMovements(page, 3);

  // The callback runs inside the browser, so it can only use its own argument
  // and browser globals; nothing from this module is in scope there.
  const results = await page.evaluate((maxResults) => {
    const items = [];
    const seenUrls = new Set(); // Avoid duplicates

    // Try multiple selectors for Google search results
    const resultElements = document.querySelectorAll(
      'div.g, div[data-sokoban-container], div[data-hveid], div.Gx5Zad'
    );

    for (const element of resultElements) {
      if (items.length >= maxResults) break;

      const titleElement = element.querySelector('h3');
      const linkElement = element.querySelector('a[href]');
      if (!titleElement || !linkElement || !linkElement.href) continue;

      const url = linkElement.href;
      // Skip non-http links and duplicates
      if (!url.startsWith('http') || seenUrls.has(url)) continue;
      seenUrls.add(url);

      const snippetElement =
        element.querySelector('div[data-sncf]') ||
        element.querySelector('div[style*="-webkit-line-clamp"]') ||
        element.querySelector('.VwiC3b') ||
        element.querySelector('.lyLwlc') ||
        element.querySelector('.s') ||
        element.querySelector('span:not([class])');

      // Try to find date/recency information
      const dateElement =
        element.querySelector('span.MUxGbd') ||
        element.querySelector('.f') ||
        element.querySelector('.LEwnzc') ||
        element.querySelector('span[style*="color"]');

      try {
        items.push({
          title: titleElement.innerText,
          url,
          domain: new URL(url).hostname,
          snippet: snippetElement ? snippetElement.innerText : '',
          dateText: dateElement ? dateElement.innerText : ''
        });
      } catch (e) {
        // Deliberate best effort: a result whose URL cannot be parsed is skipped.
      }
    }

    return items;
  }, MAX_RESULTS);

  // Analyze recency
  for (const result of results) {
    Object.assign(result, classifyRecency(result.dateText));
  }

  // Get result count
  const resultStats = await page.evaluate(() => {
    const statsElement = document.querySelector('#result-stats');
    return statsElement ? statsElement.innerText : 'Unknown';
  });

  // Calculate recency distribution; every category classifyRecency can return has a key,
  // so the counts always add up to results.length.
  const recencyDist = { today: 0, this_week: 0, this_month: 0, older: 0, dated: 0, unknown: 0 };
  for (const { recency } of results) {
    recencyDist[recency] += 1;
  }

  return { results, stats: resultStats, recencyDist };
}

/**
 * Calculate relevance score for results based on query.
 *
 * Scoring: +3 per query term found in the title, +1 per query term found in the
 * snippet, +2 when the domain contains one of TARGET_DOMAINS, +1 per REPAIR_TERMS
 * entry found in title or snippet. A result is `relevant` at a score of 3 or more.
 *
 * @param {object[]} results - Results to score; each is mutated in place
 *   (`relevanceScore` and `relevant` are added).
 * @param {string} query - Original search query.
 * @returns {object[]} The same `results` array.
 */
function calculateRelevance(results, query) {
  // Drop quoting/grouping characters, short words and search operators / URL fragments.
  const ignoredFragments = ['site:', 'http', 'https'];
  const queryTerms = query
    .toLowerCase()
    .replace(/['"()]/g, '')
    .split(/\s+/)
    .filter((term) => term.length > 3 && !ignoredFragments.some((fragment) => term.includes(fragment)));

  for (const result of results) {
    const titleLower = (result.title ?? '').toLowerCase();
    const snippetLower = (result.snippet ?? '').toLowerCase();
    let relevanceScore = 0;

    // Check keyword presence in title (weighted higher)
    for (const term of queryTerms) {
      if (titleLower.includes(term)) relevanceScore += 3;
      if (snippetLower.includes(term)) relevanceScore += 1;
    }

    // Check for expected domains (reddit, kijiji, craigslist, etc.)
    if (TARGET_DOMAINS.some((domain) => (result.domain ?? '').includes(domain))) {
      relevanceScore += 2;
    }

    // Check for repair-related terms
    for (const term of REPAIR_TERMS) {
      if (titleLower.includes(term) || snippetLower.includes(term)) {
        relevanceScore += 1;
      }
    }

    result.relevanceScore = relevanceScore;
    result.relevant = relevanceScore >= 3;
  }

  return results;
}

/**
 * Average a numeric property over the results, formatted with one decimal.
 *
 * @param {object[]} results - Scored results.
 * @param {string} key - Name of the numeric property to average.
 * @returns {string|number} One-decimal string, or the number 0 when there are no results.
 */
function averageScore(results, key) {
  if (results.length === 0) return 0;
  const total = results.reduce((sum, result) => sum + result[key], 0);
  return (total / results.length).toFixed(1);
}

/**
 * Validate a single Google Alert query with recency and relevance analysis.
 *
 * @param {import('playwright').Browser} browser - Browser used to create a fresh context.
 * @param {string} query - Search query to validate.
 * @returns {Promise<object>} On success: { query, success, resultCount, recentCount,
 *   relevantCount, avgRecencyScore, avgRelevanceScore, recencyDist, stats, results }
 *   (results limited to the first 10). On failure: { query, success: false, error }.
 */
async function validateQuery(browser, query) {
  const context = await getHumanizedContext(browser);

  try {
    const page = await context.newPage();

    // Perform search
    await searchGoogle(page, query);

    // Extract and analyze results
    const { results, stats, recencyDist } = await extractResults(page);

    // Calculate relevance
    calculateRelevance(results, query);

    // Calculate metrics
    const recentResults = results.filter((r) => ['today', 'this_week'].includes(r.recency)).length;
    const relevantResults = results.filter((r) => r.relevant).length;
    const avgRecencyScore = averageScore(results, 'recencyScore');
    const avgRelevanceScore = averageScore(results, 'relevanceScore');

    console.log(`\n📊 Results Summary:`);
    console.log(` Stats: ${stats}`);
    console.log(` Found: ${results.length} results`);
    console.log(` Recent (today/this week): ${recentResults}`);
    console.log(` Relevant: ${relevantResults}`);
    console.log(` Avg Recency Score: ${avgRecencyScore}/10`);
    console.log(` Avg Relevance Score: ${avgRelevanceScore}\n`);

    console.log(`📅 Recency Distribution:`);
    console.log(` Today: ${recencyDist.today}`);
    console.log(` This Week: ${recencyDist.this_week}`);
    console.log(` This Month: ${recencyDist.this_month}`);
    console.log(` Older: ${recencyDist.older}`);
    console.log(` Dated: ${recencyDist.dated}`);
    console.log(` Unknown: ${recencyDist.unknown}\n`);

    if (results.length > 0) {
      console.log(`✅ Top Results:\n`);
      results.slice(0, 5).forEach((result, index) => {
        const recencyTag = result.recency !== 'unknown' ? `[${result.recency}]` : '';
        const relevanceTag = result.relevant ? '✓' : '○';
        console.log(`${index + 1}. ${relevanceTag} ${result.title} ${recencyTag}`);
        console.log(` ${result.domain}`);
        console.log(` ${result.snippet.substring(0, 100)}...\n`);
      });
    } else {
      console.log(`❌ No results found for this query\n`);
    }

    // Simulate reading before closing
    await simulateReading(page, 3000);

    return {
      query,
      success: results.length > 0,
      resultCount: results.length,
      recentCount: recentResults,
      relevantCount: relevantResults,
      avgRecencyScore: parseFloat(avgRecencyScore),
      avgRelevanceScore: parseFloat(avgRelevanceScore),
      recencyDist,
      stats,
      results: results.slice(0, 10) // Return first 10
    };
  } catch (error) {
    console.error(`❌ Error validating query: ${error.message}`);
    return { query, success: false, error: error.message };
  } finally {
    // Closing the context also closes every page that belongs to it, so cleanup
    // still happens when page creation or an individual page close fails.
    await context.close();
  }
}

/**
 * Scrape a specific website with human-like behavior.
 *
 * @param {import('playwright').Browser} browser - Browser used to create a fresh context.
 * @param {string} url - Address of the page to scrape.
 * @param {{ title?: string|string[], content?: string|string[] }} [selectors={}]
 *   Optional CSS selectors (a single selector or a list, tried in order) overriding
 *   the defaults used to locate the title and the main content.
 * @returns {Promise<object>} { url, success: true, content } where content holds
 *   `title`, `content` (first 1000 characters) and `links` (first 20 anchors),
 *   or { url, success: false, error } on failure.
 */
async function scrapeWebsite(browser, url, selectors = {}) {
  console.log(`\n🌐 Scraping: ${url}\n`);

  const context = await getHumanizedContext(browser);

  try {
    const page = await context.newPage();

    // Navigate to page
    await page.goto(url, { waitUntil: 'networkidle', timeout: 30000 });
    await humanWaitForLoad(page, { minWait: 2000, maxWait: 4000 });

    // Initial random mouse movements
    await randomMouseMovements(page, 2);

    // Scroll through page naturally
    await humanScroll(page, {
      scrollCount: 3,
      minScroll: 150,
      maxScroll: 400,
      minDelay: 1000,
      maxDelay: 2500,
      randomDirection: true
    });

    // More random movements
    await randomMouseMovements(page, 2);

    // Extract content based on selectors (callback runs inside the browser)
    const content = await page.evaluate((sels) => {
      // Accept a single selector string as well as a list of selectors.
      const toList = (value, fallback) => (value ? [].concat(value) : fallback);
      const firstMatch = (candidates) => {
        for (const candidate of candidates) {
          const match = document.querySelector(candidate);
          if (match) return match;
        }
        return null;
      };

      const data = {};

      // Try to extract title
      const titleEl = firstMatch(toList(sels.title, ['h1', 'h2', '.title', '#title']));
      if (titleEl) data.title = titleEl.innerText;

      // Try to extract main content
      const contentEl = firstMatch(toList(sels.content, ['article', 'main', '.content', '#content']));
      if (contentEl) data.content = contentEl.innerText.substring(0, 1000);

      // Extract links
      data.links = Array.from(document.querySelectorAll('a'))
        .slice(0, 20)
        .map((anchor) => ({
          text: anchor.innerText.substring(0, 100),
          href: anchor.href
        }));

      return data;
    }, selectors ?? {});

    console.log(`\n📄 Scraped Content:`);
    console.log(` Title: ${content.title || 'N/A'}`);
    console.log(` Content Length: ${content.content?.length || 0} chars`);
    console.log(` Links Found: ${content.links?.length || 0}\n`);

    // Simulate reading/interaction
    await simulateReading(page, 4000);

    return { url, success: true, content };
  } catch (error) {
    console.error(`❌ Error scraping: ${error.message}`);
    return { url, success: false, error: error.message };
  } finally {
    // Closing the context also closes every page that belongs to it.
    await context.close();
  }
}

/**
 * Main function: parse CLI arguments, launch the browser and run either a
 * URL scrape (`--url <address>`) or a search-query validation.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);

  if (args.length === 0) {
    console.log(USAGE);
    return;
  }

  const isUrlMode = args[0] === '--url';
  if (isUrlMode && !args[1]) {
    // Without this guard the literal text "--url" would be searched on Google.
    console.error('Missing value for --url');
    console.log(USAGE);
    process.exitCode = 1;
    return;
  }

  // Launch browser with anti-detection args
  console.log('🚀 Launching browser...\n');
  const browser = await chromium.launch({
    headless: false, // Set to true for production
    slowMo: 50, // Slight delay between actions (more human-like)
    args: [
      '--disable-blink-features=AutomationControlled',
      '--disable-dev-shm-usage',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      // NOTE(review): the next two flags turn off the same-origin policy and site
      // isolation for every page this browser visits - confirm they are really needed.
      '--disable-web-security',
      '--disable-features=IsolateOrigins,site-per-process'
    ]
  });

  try {
    if (isUrlMode) {
      // Scrape a specific URL
      const result = await scrapeWebsite(browser, args[1]);
      console.log('\n' + JSON.stringify(result, null, 2));
    } else {
      // Validate a search query (strip one leading/trailing quote left over from the shell)
      const query = args.join(' ').replace(/^["']|["']$/g, '');
      const result = await validateQuery(browser, query);
      console.log('\n' + JSON.stringify(result, null, 2));
    }
  } finally {
    await browser.close();
    console.log('\n✅ Browser closed\n');
  }
}

// Run if called directly. pathToFileURL produces the same encoding/format as
// import.meta.url, including on Windows and for paths with special characters.
const invokedScript = process.argv[1];
if (invokedScript && import.meta.url === pathToFileURL(invokedScript).href) {
  main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
}

export { validateQuery, scrapeWebsite, searchGoogle, extractResults };