# Like the crawler, but with a session hand-off instead of a cookie-sharing approach:
# a non-headless browser is opened, the user logs in and solves any CAPTCHAs, and the
# authenticated session is then handed off to the scraper.
#
# This is more reliable, easier to debug, and more resistant to CAPTCHAs.
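#
# Flow: (1) open a visible Playwright browser and wait for a manual login,
# (2) capture the context's storage_state (cookies, localStorage), and
# (3) run a crawl4ai deep crawl that reuses that state.
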
import asyncio
import json
import logging
import os
from datetime import datetime
from itertools import chain

from playwright.async_api import async_playwright

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

# --- CONFIGURATION ---
# Only the login URL is needed for the initial navigation; the user logs in and
# navigates manually, and the crawl itself starts from the MarketLine news listing
# URL hard-coded in main().
LOGIN_URL = "https://guides.lib.uoguelph.ca/az/databases?q=marketline"

# --- CRAWLER SETTINGS ---
DEPTH = 2
COUNT = 50
SCRAPER_KEYWORDS = [
    "arms", "weapons", "military", "defence", "defense", "aerospace",
    "canadian armed forces", "caf", "dnd", "global affairs canada",
    "export", "sale", "contract", "procurement", "acquisition",
    "armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
    "aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
    "general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam",
]

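# These keywords are used by the KeywordRelevanceScorer below to prioritize which
# links the best-first deep crawl visits first.
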
# Optional debug filter (left commented out): passes every link through unchanged
# while printing it, which is handy for inspecting what the filter chain receives.
# class DebugFilter(BaseFilter):
#     def apply(self, urls):
#         print("\n=== LINKS BEFORE FILTERING ===")
#         for u in urls:
#             print(u)
#         return urls  # don't drop anything

# Keep only news pages; deny_words uses reverse=True so URLs matching its patterns
# (analysis, sectors, dashboards, etc.) are filtered out.
include_words = URLPatternFilter(patterns=["*News*", "*news*"])
deny_words = URLPatternFilter(
    patterns=["*Analysis*", "*Sectors*", "*Commentsandopinions*", "*Dashboard*", "*Homepage*"],
    reverse=True,
)
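
# For example (URLs are illustrative only): a link like ".../News/NewsListing?..."
# passes both filters, while ".../Analysis/..." or ".../Dashboard/..." is rejected
# by deny_words.
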
# --- SETUP LOGGING ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def save_results_to_json(successful_data, failed_pages):
    """Saves the crawl results to JSON files in the crawl_results directory (overwriting any previous run)."""
    # No timestamp for now: a fixed filename is easier to analyze, so each run simply
    # overwrites the previous results. Later runs could instead write timestamped files
    # into one folder and post-process them, e.g.:
    # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = "crawl_results"
    os.makedirs(output_dir, exist_ok=True)
    logging.info(f"Saving results to '{output_dir}' directory...")

    successful_file = os.path.join(output_dir, "successful_pages.json")
    with open(successful_file, "w", encoding="utf-8") as f:
        json.dump(successful_data, f, indent=4, ensure_ascii=False)
    logging.info(f"Saved {len(successful_data)} successful pages to '{successful_file}'")

    if failed_pages:
        failed_file = os.path.join(output_dir, "failed_pages.json")
        with open(failed_file, "w", encoding="utf-8") as f:
            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
        logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")

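# Each entry written to successful_pages.json has the shape built in main(); the
# values below are illustrative only:
# {
#     "url": "https://advantage.marketline.com/News/...",
#     "content": "<page markdown>",
#     "depth": 1,
#     "score": 0.42,
#     "timestamp": "2025-01-01T12:00:00"
# }

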
async def main():
    """
    Main function to handle manual login, capture the session state from the
    browser context, and then hand it off to the crawler.
    """
    # --- STEP 1: Manual Login in a Temporary Browser ---
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()  # This is the initial page

        logging.info("A browser window has opened. Please complete the following steps:")
        logging.info("1. Log in and navigate to the exact page where you want the crawl to begin.")
        logging.info("2. Solve any CAPTCHAs or 2FA prompts.")
        await page.goto(LOGIN_URL)

        input("\n>>> Press Enter in this console window once you are logged in and on the starting page... <<<\n")

        # Instead of relying on the original 'page' object, inspect the context's open
        # tabs; this handles cases where the login process opens a new tab.
        print("ALL PAGES:")
        for open_page in context.pages:
            print("URL: ", open_page.url)
        active_page = context.pages[-1]
        start_url = "https://advantage.marketline.com/News/NewsListing?q%5B%5D=aerospace+and+defence&IsSearchApi=true&exactword=1"

        logging.info(f"Login complete. Active tab: {active_page.url}")
        logging.info(f"Starting crawl from: {start_url}")

        # Capture the full session state (cookies, localStorage, etc.)
        storage_state = await context.storage_state()

        # We no longer need this temporary browser.
        await browser.close()

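    # Optional sketch (not used elsewhere in this script): the captured session could be
    # persisted so later runs can skip the manual login while the cookies remain valid.
    # The file name is illustrative only.
    # with open("storage_state.json", "w", encoding="utf-8") as f:
    #     json.dump(storage_state, f)
    # A later run could then json.load this file and pass the dict to BrowserConfig
    # in place of the freshly captured storage_state.
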
    # --- STEP 2: Configure and Run the Crawler with the Captured State ---
    # UndetectedAdapter is intended to make the automated browser harder to detect.
    adapter = UndetectedAdapter()
    # Pass the captured 'storage_state' dictionary to the crawler's browser configuration.
    browser_config = BrowserConfig(
        # enable_stealth=True,
        headless=False,
        storage_state=storage_state,  # This injects your logged-in session.
    )

    scorer = KeywordRelevanceScorer(
        keywords=SCRAPER_KEYWORDS,
        weight=0.7,
    )

    link_filter = FilterChain([
        # DebugFilter(),
        include_words,
        deny_words,
    ])

    # Best-first deep crawl: the scorer ranks candidate links, the filter chain limits
    # which URLs are followed, and stream=True yields results as they are crawled.
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=DEPTH,
            max_pages=COUNT,
            url_scorer=scorer,
            filter_chain=link_filter,
        ),
        verbose=True,
        stream=True,
        page_timeout=120000,
        wait_until="domcontentloaded",
    )

    successful_data = []
    failed_pages = []

    logging.info("Starting crawler with the captured session state...")

    async with AsyncWebCrawler(config=browser_config, browser_adapter=adapter) as crawler:
        # The crawler reuses the captured session and begins at start_url.
        async for result in await crawler.arun(start_url, config=config):
            if result.success:
                all_links = [
                    l["href"]
                    for l in chain(result.links.get("internal", []), result.links.get("external", []))
                ]

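                # result.links (as used above) is roughly of the form
                # {"internal": [{"href": "...", ...}, ...], "external": [...]};
                # only the "href" values are used here.
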
print(f"✅ Scraped: {result.url}")
|
||
print("Filtered links:")
|
||
|
||
# Apply filters one URL at a time
|
||
for url in all_links:
|
||
if include_words.apply(url) and deny_words.apply(url):
|
||
print(" ->", url)
|
||
score = result.metadata.get("score", 0)
|
||
print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
|
||
successful_data.append({
|
||
"url": result.url, "content": result.markdown,
|
||
"depth": result.metadata.get("depth", 0), "score": round(score, 2),
|
||
"timestamp": datetime.now().isoformat()
|
||
})
|
||
else:
|
||
print(f"❌ Failed: {result.url} - {result.error_message}")
|
||
failed_pages.append({'url': result.url, 'error': result.error_message})
|
||
|
||
logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
|
||
save_results_to_json(successful_data, failed_pages)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
asyncio.run(main()) |
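
# Usage (file name is illustrative): run `python manual_login_crawler.py`, log in to
# MarketLine in the browser window that opens, then press Enter in the console to
# hand the session off to the crawler.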