# ploughshares/docker/crawler/marketline_crawler.py

# NOT CURRENTLY USED
# Temporarily (and possibly permanently) replaced by the handoff-file approach, which gets around
# CAPTCHAs and similar blockers; the handoff flow is very similar anyway.
import asyncio
from playwright.async_api import async_playwright
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import DFSDeepCrawlStrategy, BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.deep_crawling.filters import URLPatternFilter
from datetime import datetime
import logging
# --- CONFIGURATION ---
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Fs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fVLBbsIwDP2VKvc2aaBAI4rE4DAktiFgO%2BwypcGFSGnSxenY%2Fn4FNo1dOFp%2Bfs%2Fv2WOUtWnEtA0Hu4b3FjBEn7WxKM6NgrTeCidRo7CyBhRBic30YSl4wkTjXXDKGRJNEcEH7ezMWWxr8BvwH1rB83pZkEMIDQpKZSeSGF0mrdu3YJpDoiTt6v0RG7o56LJ0BjoIoqMnDU5XT5stiebdUtrKE%2F0fmXF7bZNaK%2B%2FQVcFZoy0kytW0hAGXKS9jruQu7ucyjWVWyXiUVf2ql2YDOdzRkztOosW8IG9pztRI5izLh9BPcz5i%2BY4N0q6oFAz7vQ6G2MLCYpA2FIQznsUsj1lvy5hgPcGGryRa%2FYRxp%2B1O2%2F3t5MoLCMX9druKLzZfwOPZYgcgk%2FFpQ3EW9lcXuU0rf89AJjdDx2ZMr%2FgvYo147AgX85UzWn1FU2PcceZBBihISujkMvL%2FVybf&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Dsa3ysaynan5loqzpdleq8f1v4ji31utw%26csrfmiddlewaretoken%3DVnbRmbY0l1tnxKqHdJnkZ1yYGlJgPueoiNOMivmejDxCbeVl3A0iV5FdEFmO3DgG%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue"
HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"
CRAWLPAGE_URL = "https://advantage.marketline.com/News/NewsListing?q[]=aerospace+and+defense&IsSearchApi=true"
COOKIES_FILE = "marketline_cookies.json"
# --- CRAWLER SETTINGS ---
DEPTH = 2 # A depth of 2 is enough: Page 1 (List) -> Page 2 (Articles)
COUNT = 50
# UPDATED: Expanded keywords to better score article pages
SCRAPER_KEYWORDS = [
"arms", "weapons", "military", "defence", "defense", "aerospace",
"canadian armed forces", "caf", "dnd", "global affairs canada",
"export", "sale", "contract", "procurement", "acquisition",
"armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
"aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
"general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
]
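# Sketch (not wired into the crawl below): the imported KeywordRelevanceScorer could turn
# SCRAPER_KEYWORDS into a url_scorer for BestFirstCrawlingStrategy so that defence-related
# links are visited first; the weight value here is an assumption:
#
#   scorer = KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS, weight=0.7)
#   strategy = BestFirstCrawlingStrategy(max_depth=DEPTH, max_pages=COUNT, url_scorer=scorer)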
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
async def login_and_save_cookies():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()
        try:
            logging.info("Starting login process... Please complete login in the browser.")
            await page.goto(LOGIN_URL)
            # Give the user time to finish the interactive login; time.sleep() would block
            # the event loop, so use asyncio.sleep() instead.
            await asyncio.sleep(45)
            # await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
            logging.info("Login successful. Saving session cookies...")
            cookies = await context.cookies()
            with open(COOKIES_FILE, "w") as f:
                json.dump(cookies, f, indent=2)
            logging.info(f"Cookies saved to '{COOKIES_FILE}'.")
        except Exception as e:
            logging.error(f"Login failed: {e}")
        finally:
            await context.close()
            await browser.close()
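# Sketch of a less brittle wait than the fixed 45-second pause above: wait for the post-login
# redirect to reach the MarketLine homepage, using a glob pattern so extra query parameters
# do not break the match (pattern and timeout are assumptions):
#
#   await page.wait_for_url("**/HomePage/Home*", timeout=300_000)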
def save_results_to_json(successful_data, failed_pages):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"crawl_results_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)
    logging.info(f"Saving results to '{output_dir}' directory...")
    successful_file = os.path.join(output_dir, "successful_pages.json")
    with open(successful_file, "w", encoding="utf-8") as f:
        json.dump(successful_data, f, indent=4, ensure_ascii=False)
    logging.info(f"Saved {len(successful_data)} pages to '{successful_file}'")
    if failed_pages:
        failed_file = os.path.join(output_dir, "failed_pages.json")
        with open(failed_file, "w", encoding="utf-8") as f:
            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
        logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")
async def crawl_with_saved_cookies():
    if not os.path.exists(COOKIES_FILE):
        logging.warning("No cookies found. Running login first...")
        await login_and_save_cookies()
        if not os.path.exists(COOKIES_FILE):
            logging.error("Login failed or was aborted. Exiting.")
            return
    with open(COOKIES_FILE, "r") as f:
        try:
            cookies = json.load(f)
        except json.JSONDecodeError:
            logging.error(f"Error reading cookies file. Please delete '{COOKIES_FILE}' and run again.")
            return
    logging.info(f"Loaded {len(cookies)} cookies for crawling.")
    browser_config = BrowserConfig(cookies=cookies, headless=False)
    # Filter meant to restrict deep crawling to news-article links
    # (defined here but not yet attached to the strategy below).
    article_filter = URLPatternFilter(patterns=[r"/news/"])
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=DEPTH,
            max_pages=COUNT,
            # url_scorer=(),
            # url_filters=[article_filter]  # UPDATED: Add the filter to the strategy
        ),
        # scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True, stream=True, page_timeout=120000,
        wait_until="domcontentloaded"
    )
    successful_data = []
    failed_pages = []
    logging.info("Starting crawl...")
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Start from the news listing page; with stream=True, results are yielded as they complete.
        async for result in await crawler.arun(CRAWLPAGE_URL, config=config):
            if result.success:
                print("RESULT:", result)
                score = result.metadata.get("score", 0)
                print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
                successful_data.append({
                    "url": result.url, "content": result.markdown,
                    "depth": result.metadata.get("depth", 0), "score": round(score, 2),
                    "timestamp": datetime.now().isoformat()
                })
            else:
                print(f"❌ Failed: {result.url} - {result.error_message}")
                failed_pages.append({'url': result.url, 'error': result.error_message})
    logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
    save_results_to_json(successful_data, failed_pages)
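# Sketch (API usage is an assumption): crawl4ai's deep-crawl strategies take URL filters through
# a FilterChain, which is how the unused article_filter above would normally be attached:
#
#   from crawl4ai.deep_crawling.filters import FilterChain
#   strategy = BestFirstCrawlingStrategy(
#       max_depth=DEPTH,
#       max_pages=COUNT,
#       filter_chain=FilterChain([article_filter]),
#   )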
if __name__ == "__main__":
    asyncio.run(crawl_with_saved_cookies())