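"""Log in to MarketLine Advantage, save the session cookies, and deep-crawl the site.

The login step opens a browser with Playwright and stores the session cookies
in a JSON file; the crawl step reuses those cookies with crawl4ai and writes
the scraped pages to JSON files.
"""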
import asyncio
import json
import os
import sys
from collections import Counter
from datetime import datetime

from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.content_scraping_strategy import ContentScrapingStrategy, ScrapingResult, LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import URLPatternFilter
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
from playwright.async_api import async_playwright, Page
# --- CONFIGURATION ---

# TODO: this will need to change for different organizations (i.e. universities)
# Default link for the university login used when accessing MarketLine.
# NOTE(review): the query string carries a SAMLRequest and a csrfmiddlewaretoken,
# presumably captured from a single sign-in attempt, so this default may stop
# working over time -- confirm, and refresh it when it does.
DEFAULT_LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Bs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"

# The login link actually used. Set the MARKETLINE_LOGIN_URL environment
# variable to use another organization's (or a fresher) link without editing
# this file; an unset or empty variable falls back to the default above.
LOGIN_URL = os.environ.get("MARKETLINE_LOGIN_URL") or DEFAULT_LOGIN_URL

# Shouldn't need to change. After logging in, we wait for this page to load,
# which triggers saving the cookies.
HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"

# the root page to seed crawling
CRAWLPAGE_URL = "https://advantage.marketline.com/Search?industry=2800001"
# trying out another page
# CRAWLPAGE_URL = "https://www.defensenews.com/"

# name of the file where cookies are saved
COOKIES_FILE = "marketline_cookies.json"
# --- CRAWLER SETTINGS ---

# Passed to the deep-crawl strategy as max_depth.
DEPTH = 3
# Passed to the deep-crawl strategy as max_pages.
COUNT = 100

# Keywords handed to the URL relevance scorer of the deep crawl.
# TODO: maybe make this list more comprehensive?
SCRAPER_KEYWORDS = [
    # Core Terms
    "arms export",
    "arms sale",
    "arms trade",
    "weapons export",
    "weapons deal",
    "military export",
    "defence contract",
    "defense contract",

    # Canadian Context
    "canadian armed forces",
    "global affairs canada",
    "canadian defence",
    "canadian military",
    "royal canadian navy",
    "royal canadian air force",

    # Equipment & Technology
    "armoured vehicle",
    "light armoured vehicle",
    "lav",
    "naval ship",
    "warship",
    "frigate",
    "fighter jet",
    "military aircraft",
    "surveillance",
    "radar",
    "artillery",
    "munitions",
    "firearms",
    "aerospace",

    # Action & Policy Terms
    "procurement",
    "acquisition",
    "military aid",
    "export permit",
    "itar",
]
# runs login process and saves cookies so that we can run the scraping with authentication
async def login_and_save_cookies():
    """Log in through a visible browser, save the session cookies, then crawl.

    Opens a non-headless Chromium window on ``LOGIN_URL`` and waits up to five
    minutes for the page to reach ``HOMEPAGE_URL``. The cookies of that browser
    context are then written to ``COOKIES_FILE`` as JSON. Once the login
    browser has been closed, ``crawl_with_saved_cookies()`` is run.

    Login and crawl failures are printed rather than raised.
    """
    logged_in = False

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()

        try:
            await page.goto(LOGIN_URL)
            # Timeout is in milliseconds (5 minutes); presumably this long
            # because the sign-in is completed by hand in the opened window.
            await page.wait_for_url(HOMEPAGE_URL, timeout=300000)

            print("Login detected. Saving session cookies...")
            cookies = await context.cookies()
            with open(COOKIES_FILE, "w", encoding="utf-8") as f:
                json.dump(cookies, f)

            print("Cookies saved successfully!")
            logged_in = True

        except Exception as e:
            print(f"Login failed: {e}")
            print("Error details:")
            try:
                print(await page.content())
            except Exception as content_error:
                # Reading the page can fail too (e.g. the window was closed);
                # don't let that hide the login error printed above.
                print(f"(could not read page content: {content_error})")

        finally:
            await context.close()
            await browser.close()

    if not logged_in:
        return

    # The crawl runs only after a successful login and outside the login
    # try-block, so the login browser is already closed and a crawl error is
    # not misreported as a login failure.
    try:
        await crawl_with_saved_cookies()
    except Exception as e:
        print(f"Crawl failed: {e}")
def save_results_to_json(successful_data, failed_pages, output_dir="crawl_results"):
    """Save successful and failed crawl results as separate JSON files.

    Args:
        successful_data: JSON-serializable list describing the scraped pages;
            always written to ``successful_pages.json``.
        failed_pages: JSON-serializable list describing the failed pages;
            written to ``failed_pages.json`` only when it is non-empty.
        output_dir: Directory that receives both files; created if missing.

    When ``failed_pages`` is empty, a ``failed_pages.json`` left over from an
    earlier run is removed so the directory only reflects the current run.
    """
    os.makedirs(output_dir, exist_ok=True)
    print(f"\n💾 Saving results to '{output_dir}' directory...")

    # Define file paths
    successful_file = os.path.join(output_dir, "successful_pages.json")
    failed_file = os.path.join(output_dir, "failed_pages.json")

    # Save successfully scraped data
    with open(successful_file, "w", encoding="utf-8") as f:
        json.dump(successful_data, f, indent=4, ensure_ascii=False)
    print(f" Saved data for {len(successful_data)} successful pages to '{successful_file}'")

    # Save failed pages if any
    if failed_pages:
        with open(failed_file, "w", encoding="utf-8") as f:
            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
        print(f" Saved info for {len(failed_pages)} failed pages to '{failed_file}'")
    else:
        # No failures this run: drop a stale report from a previous run so it
        # is not mistaken for part of the results just written.
        try:
            os.remove(failed_file)
        except FileNotFoundError:
            pass
# runs the crawler with the cookies collected during login
async def crawl_with_saved_cookies():
    """Deep-crawl from ``CRAWLPAGE_URL`` using the cookies in ``COOKIES_FILE``.

    Prints a message and returns without crawling when the cookie file does
    not exist. Otherwise a breadth-first deep crawl is configured with
    ``DEPTH`` as max depth, ``COUNT`` as max pages and ``SCRAPER_KEYWORDS``
    for URL scoring. Each streamed result is printed and collected, the
    collected pages are passed to ``save_results_to_json`` and failures are
    summarised per depth.

    If the crawl aborts with an exception, whatever was collected up to that
    point is still saved before the exception propagates.
    """
    if not os.path.exists(COOKIES_FILE):
        print("No cookies found. Please run login first.")
        return

    with open(COOKIES_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)

    browser_config = BrowserConfig(cookies=cookies)

    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=DEPTH,
            max_pages=COUNT,
            url_scorer=KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS),
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        # TODO: scrape the PDFs better
        # scraping_strategy=PDFCrawlerStrategy(),
        verbose=True,
        stream=True,
        page_timeout=30000,
    )

    successful_data = []
    failed_pages = []
    crawl_finished = False

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            async for result in await crawler.arun(CRAWLPAGE_URL, config=config):
                # NOTE(review): crawl4ai appears to declare `metadata` and
                # `markdown` as optional -- guard against None so one odd
                # result cannot abort the whole crawl. Confirm against the
                # installed crawl4ai version.
                metadata = result.metadata or {}
                depth = metadata.get("depth", 0)

                if result.success:
                    score = metadata.get("score") or 0

                    # here we could look at a few things, the HTML, markdown, raw text, etc.
                    scraped_content = result.markdown or ""

                    print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}")
                    # Print a preview of the content to confirm it's being scraped
                    print(f" 📄 Content length: {len(scraped_content)}. Preview: {scraped_content[:120]}...")

                    successful_data.append({
                        "url": result.url,
                        "content": scraped_content,
                        "depth": depth,
                        "score": round(score, 2),
                    })
                else:
                    failed_pages.append({
                        'url': result.url,
                        'error': result.error_message,
                        'depth': depth,
                    })
                    print(f"❌ Failed: {result.url} - {result.error_message}")

        crawl_finished = True
    finally:
        # Also runs when the crawl raised part-way: keep what was collected.
        # An aborted crawl that collected nothing writes nothing, so it does
        # not overwrite the files of an earlier run with empty lists.
        if crawl_finished or successful_data or failed_pages:
            print(f"📊 Results: {len(successful_data)} successful, {len(failed_pages)} failed")
            save_results_to_json(successful_data, failed_pages)

    # Analyze failures by depth
    if failed_pages:
        failure_by_depth = Counter(failure['depth'] for failure in failed_pages)

        print("❌ Failures by depth:")
        for depth, count in sorted(failure_by_depth.items()):
            print(f" Depth {depth}: {count} failures")
if __name__ == "__main__":
    # Choose which function to run from the command line:
    #   python <this script> login  -> run the login once to get your cookies
    #                                  (login_and_save_cookies() then starts the crawl itself)
    #   python <this script>        -> crawl with the previously saved cookies
    if len(sys.argv) > 1 and sys.argv[1] == "login":
        asyncio.run(login_and_save_cookies())
    else:
        asyncio.run(crawl_with_saved_cookies())