454 lines
14 KiB
JavaScript
454 lines
14 KiB
JavaScript
/**
 * Playwright scraper with human-like behavior for Google Alerts validation.
 *
 * Usage:
 *   node scripts/playwright-scraper.js "your search query"
 *   node scripts/playwright-scraper.js --url "https://example.com"
 */
|
|
|
|
import { pathToFileURL } from 'node:url';

import { chromium } from 'playwright';

import {
  randomDelay,
  humanMouseMove,
  randomMouseMovements,
  humanScroll,
  humanClick,
  humanType,
  humanWaitForLoad,
  simulateReading,
  getHumanizedContext
} from './human-behavior.js';
|
|
|
|
/**
 * Run a Google search for `query` on `page`.
 *
 * Opens google.com, clicks the search box, types the query and presses Enter.
 * Delays, mouse movement, clicking, typing and the final wait are delegated to
 * the imported human-behavior helpers (their internals are not visible here).
 *
 * @param {import('playwright').Page} page - Page to drive.
 * @param {string} query - Search text; must contain a non-whitespace character.
 * @returns {Promise<import('playwright').Page>} The same `page` that was passed in.
 * @throws {TypeError} If `query` is not a string or is blank.
 */
async function searchGoogle(page, query) {
  // A blank query would type nothing and press Enter on an empty box, leaving
  // the caller to "extract results" from the Google home page.
  if (typeof query !== 'string' || query.trim() === '') {
    throw new TypeError('searchGoogle: query must be a non-empty string');
  }

  console.log(`\n🔍 Searching Google for: "${query}"\n`);

  // Navigate to Google
  await page.goto('https://www.google.com', { waitUntil: 'networkidle' });
  await randomDelay(1000, 2000);

  // Random mouse movements (looking around the page)
  await randomMouseMovements(page, 2);

  // Google serves the search box as either a <textarea> or an <input>.
  const searchBox = 'textarea[name="q"], input[name="q"]';
  await page.waitForSelector(searchBox);
  await randomDelay(500, 1000);

  // Click search box with human behavior
  await humanClick(page, searchBox);

  // Type query with realistic timing
  await humanType(page, searchBox, query, {
    minDelay: 60,
    maxDelay: 180,
    mistakes: 0.03
  });

  // Random pause before submitting (reading what we typed)
  await randomDelay(500, 1200);

  // Submit search (press Enter)
  await page.keyboard.press('Enter');

  // Wait for results to load
  await humanWaitForLoad(page, { minWait: 1500, maxWait: 3000 });

  return page;
}
|
|
|
|
/**
 * Classify the free-text date label scraped next to a result.
 *
 * Buckets: `today` (10), `this_week` (8), `this_month` (6), `older` (3),
 * `dated` (5, an absolute date containing a year) and `unknown` (0).
 *
 * @param {string} [dateText] - Raw label text; may be empty or missing.
 * @returns {{recency: string, recencyScore: number}}
 */
function classifyRecency(dateText) {
  const text = String(dateText ?? '').toLowerCase();

  // Whole-word matches only: a plain substring test for "day" also fires on
  // "Monday" or "holiday" and would score stale results as fresh.
  if (/\b(?:minutes?|hours?)\b/.test(text) || /\b(?:today|yesterday)\b/.test(text)) {
    return { recency: 'today', recencyScore: 10 };
  }

  const relative = /(\d+)\s*(day|week|month|year)s?\b/.exec(text);
  if (relative) {
    // Approximate every unit in days so one set of thresholds covers all.
    const daysPerUnit = { day: 1, week: 7, month: 30, year: 365 };
    const days = Number.parseInt(relative[1], 10) * daysPerUnit[relative[2]];

    // "1 day ago" stays in the top bucket, as it did before.
    if (days <= 1) return { recency: 'today', recencyScore: 10 };
    if (days <= 7) return { recency: 'this_week', recencyScore: 8 };
    if (days <= 30) return { recency: 'this_month', recencyScore: 6 };
    return { recency: 'older', recencyScore: 3 };
  }

  // An absolute date: look for a plausible year rather than any 4 digits.
  if (/\b(?:19|20)\d{2}\b/.test(text)) {
    return { recency: 'dated', recencyScore: 5 };
  }

  return { recency: 'unknown', recencyScore: 0 };
}

/**
 * Extract search results from the current Google results page and tag each
 * one with a recency bucket.
 *
 * @param {import('playwright').Page} page - Page already showing results.
 * @returns {Promise<{results: Array<object>, stats: string, recencyDist: object}>}
 *   `results` holds up to 20 entries of
 *   `{ title, url, domain, snippet, dateText, recency, recencyScore }`;
 *   `stats` is the text of `#result-stats` or `'Unknown'`;
 *   `recencyDist` counts results per recency bucket (including `dated`, so
 *   the counts add up to `results.length`).
 */
async function extractResults(page) {
  // Scroll to see more results
  await humanScroll(page, {
    scrollCount: 2,
    minScroll: 200,
    maxScroll: 500,
    minDelay: 800,
    maxDelay: 1500,
    randomDirection: true
  });

  // Random mouse movements (scanning results)
  await randomMouseMovements(page, 3);

  // This callback runs inside the browser page, so it must be self-contained
  // and cannot reference anything from this module.
  const results = await page.evaluate(() => {
    const MAX_RESULTS = 20;
    const SNIPPET_SELECTORS = [
      'div[data-sncf]',
      'div[style*="-webkit-line-clamp"]',
      '.VwiC3b',
      '.lyLwlc',
      '.s',
      'span:not([class])'
    ];
    const DATE_SELECTORS = ['span.MUxGbd', '.f', '.LEwnzc', 'span[style*="color"]'];

    // First element under `root` matching any selector, in priority order.
    const firstMatch = (root, selectors) => {
      for (const selector of selectors) {
        const found = root.querySelector(selector);
        if (found) return found;
      }
      return null;
    };

    // Try multiple selectors for Google search results
    const containers = document.querySelectorAll(
      'div.g, div[data-sokoban-container], div[data-hveid], div.Gx5Zad'
    );

    const items = [];
    const seenUrls = new Set(); // The container selectors overlap, so dedupe by URL

    for (const element of containers) {
      if (items.length >= MAX_RESULTS) break;

      const titleElement = element.querySelector('h3');
      const linkElement = element.querySelector('a[href]');
      if (!titleElement || !linkElement || !linkElement.href) continue;

      const url = linkElement.href;

      // Skip non-http links and duplicates
      if (!url.startsWith('http') || seenUrls.has(url)) continue;
      seenUrls.add(url);

      let domain;
      try {
        domain = new URL(url).hostname;
      } catch {
        // Deliberate best-effort: an unparseable URL just drops that result.
        continue;
      }

      const snippetElement = firstMatch(element, SNIPPET_SELECTORS);
      const dateElement = firstMatch(element, DATE_SELECTORS);

      items.push({
        title: titleElement.innerText,
        url,
        domain,
        snippet: snippetElement ? snippetElement.innerText : '',
        dateText: dateElement ? dateElement.innerText : ''
      });
    }

    return items;
  });

  // Tag every result with its recency bucket and score
  for (const result of results) {
    Object.assign(result, classifyRecency(result.dateText));
  }

  // Get result count
  const stats = await page.evaluate(() => {
    const statsElement = document.querySelector('#result-stats');
    return statsElement ? statsElement.innerText : 'Unknown';
  });

  // Calculate recency distribution in a single pass
  const recencyDist = {
    today: 0,
    this_week: 0,
    this_month: 0,
    older: 0,
    dated: 0,
    unknown: 0
  };
  for (const { recency } of results) {
    recencyDist[recency] += 1;
  }

  return { results, stats, recencyDist };
}
|
|
|
|
/**
 * Score each result's relevance to `query` and annotate it in place.
 *
 * Scoring per result:
 *   - each distinct query term: +3 if in the title, +1 if in the snippet
 *   - +2 if the domain contains one of the target marketplaces/forums
 *   - +1 for each repair-related word found in the title or snippet
 * A result is flagged `relevant` when its score is at least 3.
 *
 * Query terms are lowercased, stripped of quotes/parentheses, limited to
 * words longer than 3 characters, and exclude `site:` operators and URLs.
 *
 * @param {Array<{title?: string, snippet?: string, domain?: string}>} results
 *   Results to score; each gains `relevanceScore` and `relevant`.
 * @param {string} query - The search query the results came from.
 * @returns {Array<object>} The same `results` array, for chaining.
 */
function calculateRelevance(results, query) {
  const targetDomains = ['reddit.com', 'kijiji.ca', 'craigslist', 'facebook.com', 'used.ca'];
  const repairTerms = ['repair', 'fix', 'broken', 'replace', 'service', 'refurbish'];

  // Dedupe so a word repeated in the query is not scored more than once.
  const queryTerms = [
    ...new Set(
      String(query ?? '')
        .toLowerCase()
        .replace(/['"()]/g, '')
        .split(/\s+/)
        .filter((term) => term.length > 3 && !term.includes('site:') && !term.includes('http'))
    )
  ];

  for (const result of results) {
    // Scraped fields can be missing; treat them as empty instead of throwing.
    const titleLower = String(result.title ?? '').toLowerCase();
    const snippetLower = String(result.snippet ?? '').toLowerCase();
    const domain = String(result.domain ?? '');

    let relevanceScore = 0;

    // Keyword presence, weighted higher in the title
    for (const term of queryTerms) {
      if (titleLower.includes(term)) relevanceScore += 3;
      if (snippetLower.includes(term)) relevanceScore += 1;
    }

    // Expected domains (reddit, kijiji, craigslist, etc.)
    if (targetDomains.some((target) => domain.includes(target))) {
      relevanceScore += 2;
    }

    // Repair-related terms
    for (const term of repairTerms) {
      if (titleLower.includes(term) || snippetLower.includes(term)) {
        relevanceScore += 1;
      }
    }

    result.relevanceScore = relevanceScore;
    result.relevant = relevanceScore >= 3;
  }

  return results;
}
|
|
|
|
/**
 * Validate a single Google Alert query: run the search, extract results,
 * score recency and relevance, print a summary and return the metrics.
 *
 * Failures while opening the page, searching or extracting are caught and
 * reported as `{ query, success: false, error }` instead of rejecting.
 * Only a failure to create the browser context itself rejects.
 *
 * @param {import('playwright').Browser} browser - Browser to open a context in.
 * @param {string} query - Search query to validate.
 * @returns {Promise<object>} On success: `{ query, success, resultCount,
 *   recentCount, relevantCount, avgRecencyScore, avgRelevanceScore,
 *   recencyDist, stats, results }` with `results` capped at 10 entries.
 */
async function validateQuery(browser, query) {
  const context = await getHumanizedContext(browser);

  try {
    // Created inside the try so a newPage() failure still reaches the
    // finally block and the context is not leaked.
    const page = await context.newPage();

    // Perform search
    await searchGoogle(page, query);

    // Extract and analyze results
    const { results, stats, recencyDist } = await extractResults(page);

    // Calculate relevance (annotates `results` in place)
    calculateRelevance(results, query);

    // Mean of a numeric field, as a 1-decimal string (or 0 with no results)
    const averageOf = (field) =>
      results.length > 0
        ? (results.reduce((sum, r) => sum + r[field], 0) / results.length).toFixed(1)
        : 0;

    // Calculate metrics
    const recentResults = results.filter((r) => ['today', 'this_week'].includes(r.recency)).length;
    const relevantResults = results.filter((r) => r.relevant).length;
    const avgRecencyScore = averageOf('recencyScore');
    const avgRelevanceScore = averageOf('relevanceScore');

    console.log(`\n📊 Results Summary:`);
    console.log(` Stats: ${stats}`);
    console.log(` Found: ${results.length} results`);
    console.log(` Recent (today/this week): ${recentResults}`);
    console.log(` Relevant: ${relevantResults}`);
    console.log(` Avg Recency Score: ${avgRecencyScore}/10`);
    console.log(` Avg Relevance Score: ${avgRelevanceScore}\n`);

    console.log(`📅 Recency Distribution:`);
    console.log(` Today: ${recencyDist.today}`);
    console.log(` This Week: ${recencyDist.this_week}`);
    console.log(` This Month: ${recencyDist.this_month}`);
    console.log(` Older: ${recencyDist.older}`);
    console.log(` Unknown: ${recencyDist.unknown}\n`);

    if (results.length > 0) {
      console.log(`✅ Top Results:\n`);
      results.slice(0, 5).forEach((result, index) => {
        const recencyTag = result.recency !== 'unknown' ? `[${result.recency}]` : '';
        const relevanceTag = result.relevant ? '✓' : '○';
        console.log(`${index + 1}. ${relevanceTag} ${result.title} ${recencyTag}`);
        console.log(` ${result.domain}`);
        console.log(` ${result.snippet.substring(0, 100)}...\n`);
      });
    } else {
      console.log(`❌ No results found for this query\n`);
    }

    // Simulate reading before closing
    await simulateReading(page, 3000);

    return {
      query,
      success: results.length > 0,
      resultCount: results.length,
      recentCount: recentResults,
      relevantCount: relevantResults,
      avgRecencyScore: parseFloat(avgRecencyScore),
      avgRelevanceScore: parseFloat(avgRelevanceScore),
      recencyDist,
      stats,
      results: results.slice(0, 10) // Return first 10
    };
  } catch (error) {
    console.error(`❌ Error validating query: ${error.message}`);
    return {
      query,
      success: false,
      error: error.message
    };
  } finally {
    // Closing the context also closes every page it owns, so no separate
    // page.close() is needed. A throw here would replace the value returned
    // above, so a close failure is logged rather than propagated.
    try {
      await context.close();
    } catch (closeError) {
      console.warn(`⚠️ Failed to close browser context: ${closeError.message}`);
    }
  }
}
|
|
|
|
/**
 * Scrape a specific website with human-like behavior.
 *
 * Loads `url`, scrolls and moves the mouse via the human-behavior helpers,
 * then extracts a title, up to 1000 characters of main content and the first
 * 20 links.
 *
 * Failures while opening the page, navigating or extracting are caught and
 * reported as `{ url, success: false, error }` instead of rejecting.
 *
 * @param {import('playwright').Browser} browser - Browser to open a context in.
 * @param {string} url - Page to scrape.
 * @param {{title?: string|string[], content?: string|string[]}} [selectors]
 *   Optional CSS selectors, tried in order; a single string is accepted as
 *   well as an array. Defaults are used for any key that is omitted.
 * @returns {Promise<object>} `{ url, success: true, content }` where
 *   `content` is `{ title?, content?, links }`, or the failure object above.
 */
async function scrapeWebsite(browser, url, selectors = {}) {
  console.log(`\n🌐 Scraping: ${url}\n`);

  const context = await getHumanizedContext(browser);

  try {
    // Created inside the try so a newPage() failure still reaches the
    // finally block and the context is not leaked.
    const page = await context.newPage();

    // Navigate to page
    await page.goto(url, { waitUntil: 'networkidle', timeout: 30000 });
    await humanWaitForLoad(page, { minWait: 2000, maxWait: 4000 });

    // Initial random mouse movements
    await randomMouseMovements(page, 2);

    // Scroll through page naturally
    await humanScroll(page, {
      scrollCount: 3,
      minScroll: 150,
      maxScroll: 400,
      minDelay: 1000,
      maxDelay: 2500,
      randomDirection: true
    });

    // More random movements
    await randomMouseMovements(page, 2);

    // This callback runs inside the browser page, so it must be
    // self-contained. `selectors ?? {}` guards against an explicit null.
    const content = await page.evaluate((sels) => {
      // Accept a single selector string as well as an array of them;
      // iterating a bare string would try one character at a time.
      const toList = (value, fallback) => (value ? [].concat(value) : fallback);

      // First element matching any selector, in priority order.
      const firstMatch = (candidates) => {
        for (const selector of candidates) {
          const el = document.querySelector(selector);
          if (el) return el;
        }
        return null;
      };

      const data = {};

      // Try to extract title
      const titleEl = firstMatch(toList(sels.title, ['h1', 'h2', '.title', '#title']));
      if (titleEl) {
        data.title = titleEl.innerText;
      }

      // Try to extract main content
      const contentEl = firstMatch(
        toList(sels.content, ['article', 'main', '.content', '#content'])
      );
      if (contentEl) {
        data.content = contentEl.innerText.substring(0, 1000);
      }

      // Extract links (first 20 only)
      data.links = Array.from(document.querySelectorAll('a'))
        .slice(0, 20)
        .map((a) => ({
          text: a.innerText.substring(0, 100),
          href: a.href
        }));

      return data;
    }, selectors ?? {});

    console.log(`\n📄 Scraped Content:`);
    console.log(` Title: ${content.title || 'N/A'}`);
    console.log(` Content Length: ${content.content?.length || 0} chars`);
    console.log(` Links Found: ${content.links?.length || 0}\n`);

    // Simulate reading/interaction
    await simulateReading(page, 4000);

    return {
      url,
      success: true,
      content
    };
  } catch (error) {
    console.error(`❌ Error scraping: ${error.message}`);
    return {
      url,
      success: false,
      error: error.message
    };
  } finally {
    // Closing the context also closes every page it owns. A throw here
    // would replace the value returned above, so log instead of propagating.
    try {
      await context.close();
    } catch (closeError) {
      console.warn(`⚠️ Failed to close browser context: ${closeError.message}`);
    }
  }
}
|
|
|
|
/**
 * Remove one pair of quotes that wraps the whole text, e.g. `'foo bar'`.
 *
 * Text is returned unchanged unless it starts and ends with the same quote
 * character and that character does not occur in between. This keeps phrase
 * queries such as `"macbook repair" Toronto` intact.
 *
 * @param {string} text
 * @returns {string}
 */
function stripWrappingQuotes(text) {
  const wrapped = /^(["'])(.*)\1$/s.exec(text);
  if (wrapped && !wrapped[2].includes(wrapped[1])) {
    return wrapped[2];
  }
  return text;
}

/**
 * Main function: parse CLI arguments, launch the browser and run one mode.
 *
 *   <query...>    validate a Google search query (arguments joined by spaces)
 *   --url <url>   scrape a single page
 *
 * With no arguments it prints usage and exits with status 0. `--url` without
 * a value prints an error, sets exit status 1 and returns without launching
 * a browser. The JSON result is printed to stdout.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);

  if (args.length === 0) {
    console.log(`
Usage:
  node scripts/playwright-scraper.js "your search query"
  node scripts/playwright-scraper.js --url "https://example.com"

Examples:
  node scripts/playwright-scraper.js '"macbook repair" Toronto'
  node scripts/playwright-scraper.js --url "https://www.reddit.com/r/toronto"
`);
    process.exit(0);
  }

  // Without this guard a bare `--url` falls through to the search branch
  // and the script Googles the literal text "--url".
  const isUrlMode = args[0] === '--url';
  if (isUrlMode && !args[1]) {
    console.error('❌ --url requires a value, e.g. --url "https://example.com"');
    process.exitCode = 1;
    return;
  }

  // Launch browser with anti-detection args
  console.log('🚀 Launching browser...\n');
  const browser = await chromium.launch({
    headless: false, // Set to true for production
    slowMo: 50, // Slight delay between actions (more human-like)
    // NOTE(review): --no-sandbox, --disable-setuid-sandbox, --disable-web-security
    // and disabling site isolation weaken browser security while loading
    // arbitrary sites; confirm each flag is really required.
    args: [
      '--disable-blink-features=AutomationControlled',
      '--disable-dev-shm-usage',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-web-security',
      '--disable-features=IsolateOrigins,site-per-process'
    ]
  });

  try {
    if (isUrlMode) {
      // Scrape a specific URL
      const result = await scrapeWebsite(browser, args[1]);
      console.log('\n' + JSON.stringify(result, null, 2));
    } else {
      // Validate a search query
      const query = stripWrappingQuotes(args.join(' '));
      const result = await validateQuery(browser, query);
      console.log('\n' + JSON.stringify(result, null, 2));
    }
  } finally {
    await browser.close();
    console.log('\n✅ Browser closed\n');
  }
}
|
|
|
|
// Run main() only when this file is the script Node was started with, not
// when it is imported as a module. pathToFileURL builds the URL the same way
// Node does for import.meta.url, so Windows paths and characters that get
// percent-encoded (spaces, '#', non-ASCII) compare correctly; a hand-built
// `file://${path}` string does not. process.argv[1] can be undefined, for
// example under `node -e`.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  main().catch((error) => {
    console.error(error);
    // Signal failure to the shell instead of exiting with status 0.
    process.exitCode = 1;
  });
}
|
|
|
|
export { validateQuery, scrapeWebsite, searchGoogle, extractResults };
|
|
|