rss-feedmonitor/scripts/scraper-config.js

124 lines
4.3 KiB
JavaScript

/**
 * Tunable settings for the Playwright scraper and its simulated human
 * behavior. Edit the values below to fine-tune bot-detection avoidance.
 *
 * Each section is declared as its own constant and then assembled into the
 * exported `config` object at the bottom of the file.
 */

// ---- Browser settings ------------------------------------------------------
const browserSettings = {
  headless: false, // flip to true for production
  slowMo: 50, // milliseconds by which actions are slowed down
  timeout: 30000, // default timeout for operations
};

// ---- Human behavior: mouse movement ----------------------------------------
const mouseBehavior = {
  overshootChance: 0.15, // probability (0-1) of overshooting the target
  overshootDistance: 20, // maximum overshoot, in pixels
  pathSteps: 25, // number of steps in the bezier curve
  stepDelay: 10, // milliseconds between movement steps
};

// ---- Human behavior: scrolling ---------------------------------------------
const scrollBehavior = {
  minAmount: 100, // fewest pixels per scroll
  maxAmount: 400, // most pixels per scroll
  minDelay: 500, // shortest delay between scrolls (ms)
  maxDelay: 2000, // longest delay between scrolls (ms)
  randomDirectionChance: 0.15, // chance of scrolling the opposite direction
  smoothIncrements: [5, 12], // range of increments for smooth scrolling
};

// ---- Human behavior: typing ------------------------------------------------
const typingBehavior = {
  minDelay: 50, // shortest delay between keystrokes (ms)
  maxDelay: 150, // longest delay between keystrokes (ms)
  mistakeChance: 0.02, // probability (0-1) of a typo
  pauseOnSpace: 1.5, // pause multiplier after a space
  pauseOnPunctuation: 2.0, // pause multiplier after punctuation
};

// ---- Human behavior: clicking ----------------------------------------------
const clickBehavior = {
  preClickDelay: [100, 300], // range for the pause before a click
  postClickDelay: [200, 500], // range for the pause after a click
  doubleClickChance: 0.02, // probability of an accidental double-click
  clickOffset: [0.3, 0.7], // click position within the element (fraction)
};

// ---- Human behavior: general timing ----------------------------------------
const timingBehavior = {
  pageLoadWait: [1000, 3000], // wait after page load
  readingSimulation: 5000, // duration of simulated reading
  delayBetweenActions: [100, 500], // general delays between actions
};

// ---- Viewport configurations (meant to be selected at random) --------------
const viewportPool = [
  { width: 1920, height: 1080 },
  { width: 1366, height: 768 },
  { width: 1536, height: 864 },
  { width: 1440, height: 900 },
  { width: 2560, height: 1440 },
];

// ---- User agent strings (meant to be selected at random) -------------------
const userAgentPool = [
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
];

// ---- Geolocation (defaults to Toronto) -------------------------------------
const defaultGeolocation = {
  latitude: 43.6532,
  longitude: -79.3832,
};

// ---- Locale settings -------------------------------------------------------
const localeSettings = {
  language: 'en-CA',
  timezone: 'America/Toronto',
};

// ---- Validation settings ---------------------------------------------------
const validationSettings = {
  maxAlertsToTest: 5, // upper bound on alerts tested in one batch
  delayBetweenTests: 12000, // delay between alert tests (ms); raised for politeness
  randomizeOrder: true, // shuffle the test order
  saveReports: true, // write validation reports to file
  saveNotes: true, // write detailed notes in markdown
};

// ---- Rate limiting and safety ----------------------------------------------
const rateLimitSettings = {
  requestsPerMinute: 10, // maximum requests per minute
  cooldownAfter: 5, // start a cooldown after this many requests
  cooldownDuration: 60000, // cooldown length (ms)
};

// ---- Scraping targets: selectors per site ----------------------------------
const googleTarget = {
  searchUrl: 'https://www.google.com',
  resultSelector: 'div.g, div[data-sokoban-container]',
  titleSelector: 'h3',
  linkSelector: 'a',
  snippetSelectors: ['div[data-content-feature]', '.VwiC3b', '.s'],
};

const redditTarget = {
  postSelector: '.Post',
  titleSelector: 'h3',
  contentSelector: 'div[data-click-id="text"]',
};

/**
 * The assembled scraper configuration, available both as the named export
 * `config` and as the module's default export.
 */
export const config = {
  browser: browserSettings,
  humanBehavior: {
    mouse: mouseBehavior,
    scroll: scrollBehavior,
    typing: typingBehavior,
    clicking: clickBehavior,
    timing: timingBehavior,
  },
  viewports: viewportPool,
  userAgents: userAgentPool,
  geolocation: defaultGeolocation,
  locale: localeSettings,
  validation: validationSettings,
  rateLimiting: rateLimitSettings,
  targets: {
    google: googleTarget,
    reddit: redditTarget,
  },
};

export default config;