# ploughshares/docker/crawler/marketline_handoff.py

# Like the crawler, but with a session handoff instead of a cookie-sharing approach:
# it opens a non-headless browser, the user logs in and solves any CAPTCHAs, and then
# hands the session off to the scraper. More reliable, easier to debug, and CAPTCHA-resistant.
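# Usage sketch (assumes crawl4ai and Playwright are installed, e.g.
#   pip install crawl4ai && playwright install chromium):
#   python marketline_handoff.py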
import asyncio
import json
import logging
import os
from datetime import datetime
from itertools import chain

from playwright.async_api import async_playwright

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

# --- CONFIGURATION ---
# MODIFIED: Only the login URL is needed for the initial navigation.
# The user will navigate to the crawl starting page manually.
LOGIN_URL = "https://guides.lib.uoguelph.ca/az/databases?q=marketline"
# --- CRAWLER SETTINGS ---
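# DEPTH caps how many link-hops the deep crawl follows from the start page;
# COUNT caps the total number of pages fetched (they feed max_depth / max_pages below).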
DEPTH = 2
COUNT = 50
SCRAPER_KEYWORDS = [
    "arms", "weapons", "military", "defence", "defense", "aerospace",
    "canadian armed forces", "caf", "dnd", "global affairs canada",
    "export", "sale", "contract", "procurement", "acquisition",
    "armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
    "aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
    "general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
]
# class DebugFilter(BaseFilter):
#     def apply(self, urls):
#         print("\n=== LINKS BEFORE FILTERING ===")
#         for u in urls:
#             print(u)
#         return urls  # don't drop anything
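# URL filters for the deep crawl: include_words keeps links whose URL contains "News"/"news";
# deny_words lists Analysis/Sectors/Comments/Dashboard/Homepage patterns with reverse=True,
# which (as I understand crawl4ai's URLPatternFilter) inverts the match into a deny-list.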
include_words = URLPatternFilter(patterns=["*News*", "*news*"])
deny_words = URLPatternFilter(patterns=["*Analysis*", "*Sectors*", "*Commentsandopinions*", "*Dashboard*", "*Homepage*"], reverse=True)
# --- SETUP LOGGING ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def save_results_to_json(successful_data, failed_pages):
    """Saves the crawl results to JSON files in the 'crawl_results' directory."""
    # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Timestamping is skipped for now because a fixed filename is easier to analyze;
    # later we will probably keep a folder of timestamped files and regex through them.
    # For now we just overwrite.
    output_dir = "crawl_results"
    os.makedirs(output_dir, exist_ok=True)
    logging.info(f"Saving results to '{output_dir}' directory...")
    successful_file = os.path.join(output_dir, "successful_pages.json")
    with open(successful_file, "w", encoding="utf-8") as f:
        json.dump(successful_data, f, indent=4, ensure_ascii=False)
    logging.info(f"Saved {len(successful_data)} successful pages to '{successful_file}'")
    if failed_pages:
        failed_file = os.path.join(output_dir, "failed_pages.json")
        with open(failed_file, "w", encoding="utf-8") as f:
            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
        logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")


async def main():
    """
    Main function to handle manual login, capture the session state from the
    active tab, and then hand it off to the crawler.
    """
    # --- STEP 1: Manual Login in a Temporary Browser ---
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()  # This is the initial page
        logging.info("A browser window has opened. Please complete the following steps:")
        logging.info("1. Log in and navigate to the exact page where you want the crawl to begin.")
        logging.info("2. Solve any CAPTCHAs or 2FA prompts.")
        await page.goto(LOGIN_URL)
        input("\n>>> Press Enter in this console window once you are logged in and on the starting page... <<<\n")
        # MODIFIED: Instead of using the original 'page' object, get the current active tab.
        # This correctly handles cases where the login process opens a new tab.
        print("ALL PAGES:")
        for tab in context.pages:
            print("URL: ", tab.url)
        active_page = context.pages[-1]
        start_url = "https://advantage.marketline.com/News/NewsListing?q%5B%5D=aerospace+and+defence&IsSearchApi=true&exactword=1"
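        # Alternative sketch (not used here): derive the start URL from whatever tab the
        # user finished on, e.g. start_url = active_page.url, instead of the hard-coded
        # MarketLine news listing above.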
        logging.info(f"Login complete. Starting the crawl at: {start_url}")
        # Capture the full session state (cookies, localStorage, etc.)
        storage_state = await context.storage_state()
        # We no longer need this temporary browser.
        await browser.close()

    # --- STEP 2: Configure and Run the Crawler with the Captured State ---
    adapter = UndetectedAdapter()
    # Pass the captured 'storage_state' dictionary to the crawler's browser configuration.
    browser_config = BrowserConfig(
        # enable_stealth=True,
        headless=False,
        storage_state=storage_state  # This injects your logged-in session.
    )
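    # The scorer ranks queued links by how many SCRAPER_KEYWORDS they contain; as I read
    # crawl4ai's best-first strategy, 'weight' scales how strongly that score influences
    # crawl priority (verify against your crawl4ai version).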
    scorer = KeywordRelevanceScorer(
        keywords=SCRAPER_KEYWORDS,
        weight=0.7
    )
    filter_chain = FilterChain([
        # DebugFilter(),
        include_words,
        deny_words
    ])
    # This configuration remains the same
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=DEPTH,
            max_pages=COUNT,
            url_scorer=scorer,
            filter_chain=filter_chain
        ),
        verbose=True,
        stream=True,
        page_timeout=120000,
        wait_until="domcontentloaded"
    )
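    # With stream=True the crawler yields results as each page finishes instead of
    # returning one list at the end, which is why the loop below uses 'async for'.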
    successful_data = []
    failed_pages = []
    logging.info("Starting crawler with the captured session state...")
    async with AsyncWebCrawler(config=browser_config, browser_adapter=adapter) as crawler:
        # The crawl starts at the hard-coded MarketLine news listing URL.
        async for result in await crawler.arun(start_url, config=config):
            if result.success:
                all_links = [
                    link["href"]
                    for link in chain(result.links.get("internal", []), result.links.get("external", []))
                ]
                score = result.metadata.get("score", 0)
                print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
                print("Filtered links:")
                # Apply the URL filters one link at a time
                for url in all_links:
                    if include_words.apply(url) and deny_words.apply(url):
                        print(" ->", url)
                successful_data.append({
                    "url": result.url, "content": result.markdown,
                    "depth": result.metadata.get("depth", 0), "score": round(score, 2),
                    "timestamp": datetime.now().isoformat()
                })
            else:
                print(f"❌ Failed: {result.url} - {result.error_message}")
                failed_pages.append({'url': result.url, 'error': result.error_message})
    logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
    save_results_to_json(successful_data, failed_pages)


if __name__ == "__main__":
    asyncio.run(main())