import asyncio
from playwright.async_api import async_playwright, Page
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import ContentScrapingStrategy, ScrapingResult, LXMLWebScrapingStrategy
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.deep_crawling.filters import URLPatternFilter
from datetime import datetime

# --- CONFIGURATION ---
# TODO: this will need to change for different organizations (i.e. universities).
# Set this to your institution's login link for accessing Marketline.
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Bs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"

# Shouldn't need to change. This is the page we wait for after logging in, which triggers saving the cookies.
HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"

# The root page used to seed crawling.
CRAWLPAGE_URL = "https://advantage.marketline.com/Search?industry=2800001"

# Name of the file where cookies are saved.
COOKIES_FILE = "marketline_cookies.json"

# --- CRAWLER SETTINGS ---
DEPTH = 2
COUNT = 10  # Increased for better testing

# TODO: maybe make this list more comprehensive?
SCRAPER_KEYWORDS = [
    # Core Terms
    "arms export", "arms sale", "arms trade", "weapons export", "weapons deal",
    "military export", "defence contract", "defense contract",
    # Canadian Context
    "canadian armed forces", "global affairs canada", "canadian defence",
    "canadian military", "royal canadian navy", "royal canadian air force",
    # Equipment & Technology
    "armoured vehicle", "light armoured vehicle", "lav", "naval ship", "warship",
    "frigate", "fighter jet", "military aircraft", "surveillance", "radar",
    "artillery", "munitions", "firearms", "aerospace",
    # Action & Policy Terms
    "procurement", "acquisition", "military aid", "export permit", "itar"
]


# Runs the login process and saves cookies so that scraping can later run with authentication.
async def login_and_save_cookies():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()
        try:
            await page.goto(LOGIN_URL)
            await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
            print("Login detected. Saving session cookies...")
            cookies = await context.cookies()
            with open(COOKIES_FILE, "w") as f:
                json.dump(cookies, f)
            print("Cookies saved successfully!")
            await crawl_with_saved_cookies()
        except Exception as e:
            print(f"Login failed: {e}")
            print("Error details:")
            print(await page.content())
        finally:
            await context.close()
            await browser.close()


def save_results_to_json(successful_data, failed_pages):
    """
    Saves the successful and failed crawl results into separate JSON files
    in a dedicated directory.
    """
    output_dir = "crawl_results"
    os.makedirs(output_dir, exist_ok=True)
    print(f"\nšŸ’¾ Saving results to '{output_dir}' directory...")

    # Define file paths
    successful_file = os.path.join(output_dir, "successful_pages.json")
    failed_file = os.path.join(output_dir, "failed_pages.json")

    # Save successfully scraped data
    with open(successful_file, "w", encoding="utf-8") as f:
        json.dump(successful_data, f, indent=4, ensure_ascii=False)
    print(f"  Saved data for {len(successful_data)} successful pages to '{successful_file}'")

    # Save failed pages, if any
    if failed_pages:
        with open(failed_file, "w", encoding="utf-8") as f:
            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
        print(f"  Saved info for {len(failed_pages)} failed pages to '{failed_file}'")


# Runs the crawler with the cookies collected during login.
async def crawl_with_saved_cookies():
    if not os.path.exists(COOKIES_FILE):
        print("No cookies found. Please run login first.")
        return

    with open(COOKIES_FILE, "r") as f:
        cookies = json.load(f)

    browser_config = BrowserConfig(cookies=cookies)
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=DEPTH,
            max_pages=COUNT,
            url_scorer=KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS),
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        # TODO: scrape the PDFs better (see the PDF sketch at the bottom of this file)
        # scraping_strategy=PDFContentScrapingStrategy(),
        verbose=True,
        stream=True,
        page_timeout=30000
    )

    successful_data = []
    failed_pages = []

    async with AsyncWebCrawler(config=browser_config) as crawler:
        async for result in await crawler.arun(CRAWLPAGE_URL, config=config):
            if result.success:
                depth = result.metadata.get("depth", 0)
                score = result.metadata.get("score", 0)
                # Here we could look at a few things: the HTML, markdown, raw text, etc.
                scraped_content = result.markdown
                print(f"āœ… Depth {depth} | Score: {score:.2f} | {result.url}")
                # Print a preview of the content to confirm it's actually being scraped
                print(f"  šŸ“„ Content length: {len(scraped_content)}. Preview: {scraped_content[:120]}...")
                successful_data.append({
                    "url": result.url,
                    "content": scraped_content,
                    "depth": depth,
                    "score": round(score, 2)
                })
            else:
                failed_pages.append({
                    "url": result.url,
                    "error": result.error_message,
                    "depth": result.metadata.get("depth", 0)
                })
                print(f"āŒ Failed: {result.url} - {result.error_message}")

    print(f"šŸ“Š Results: {len(successful_data)} successful, {len(failed_pages)} failed")
    save_results_to_json(successful_data, failed_pages)

    # Analyze failures by depth
    if failed_pages:
        failure_by_depth = {}
        for failure in failed_pages:
            depth = failure["depth"]
            failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1
        print("āŒ Failures by depth:")
        for depth, count in sorted(failure_by_depth.items()):
            print(f"  Depth {depth}: {count} failures")


if __name__ == "__main__":
    # Choose which function to run:
    # 1. First, run the login function once to get your cookies.
    asyncio.run(login_and_save_cookies())
    # 2. Then, comment out the login line and run the crawl directly.
    # asyncio.run(crawl_with_saved_cookies())
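

# --- PDF SCRAPING SKETCH (untested) ---
# Rough sketch for the "scrape the PDFs better" TODO above, following the
# PDFCrawlerStrategy + PDFContentScrapingStrategy pairing described in the crawl4ai docs.
# The function name and the pdf_url parameter are placeholders (not part of the current
# pipeline), the extra import should be checked against the installed crawl4ai version,
# and it is unclear whether the saved Marketline cookies are honoured by the PDF strategy.
async def crawl_pdf_with_saved_cookies(pdf_url):
    from crawl4ai.processors.pdf import PDFCrawlerStrategy  # assumed to live alongside PDFContentScrapingStrategy

    if not os.path.exists(COOKIES_FILE):
        print("No cookies found. Please run login first.")
        return None

    with open(COOKIES_FILE, "r") as f:
        cookies = json.load(f)

    browser_config = BrowserConfig(cookies=cookies)
    pdf_config = CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy())

    # PDFCrawlerStrategy fetches the PDF itself; PDFContentScrapingStrategy extracts its text.
    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy(), config=browser_config) as crawler:
        result = await crawler.arun(pdf_url, config=pdf_config)
        if result.success:
            print(f"āœ… PDF scraped: {pdf_url}")
            return result.markdown
        print(f"āŒ PDF scrape failed: {pdf_url} - {result.error_message}")
        return None
# If this checks out, it could be called from crawl_with_saved_cookies() for result URLs ending in ".pdf".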