# NOT USED CURRENTLY
# Temporarily (and possibly permanently) switched to using the handoff file to
# get around captchas and other login hurdles.
# The handoff flow is very similar to this one anyway.

import asyncio
import json
import logging
import os
from datetime import datetime

from playwright.async_api import async_playwright

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import (
    BFSDeepCrawlStrategy,
    BestFirstCrawlingStrategy,
    DFSDeepCrawlStrategy,
)
from crawl4ai.deep_crawling.filters import URLPatternFilter
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

# Note: the BFS/DFS strategies, the LXML scraper and the keyword scorer are
# imported but not currently used; they are kept for the commented-out
# alternatives below.

# --- CONFIGURATION ---
# NOTE: the SAML login URLs below embed per-session parameters (SAMLRequest,
# uuid, csrfmiddlewaretoken), so they will likely need to be regenerated for
# each new session. The second assignment supersedes the first at runtime.
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Fs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fVLBbsIwDP2VKvc2aaBAI4rE4DAktiFgO%2BwypcGFSGnSxenY%2Fn4FNo1dOFp%2Bfs%2Fv2WOUtWnEtA0Hu4b3FjBEn7WxKM6NgrTeCidRo7CyBhRBic30YSl4wkTjXXDKGRJNEcEH7ezMWWxr8BvwH1rB83pZkEMIDQpKZSeSGF0mrdu3YJpDoiTt6v0RG7o56LJ0BjoIoqMnDU5XT5stiebdUtrKE%2F0fmXF7bZNaK%2B%2FQVcFZoy0kytW0hAGXKS9jruQu7ucyjWVWyXiUVf2ql2YDOdzRkztOosW8IG9pztRI5izLh9BPcz5i%2BY4N0q6oFAz7vQ6G2MLCYpA2FIQznsUsj1lvy5hgPcGGryRa%2FYRxp%2B1O2%2F3t5MoLCMX9druKLzZfwOPZYgcgk%2FFpQ3EW9lcXuU0rf89AJjdDx2ZMr%2FgvYo147AgX85UzWn1FU2PcceZBBihISujkMvL%2FVybf&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Dsa3ysaynan5loqzpdleq8f1v4ji31utw%26csrfmiddlewaretoken%3DVnbRmbY0l1tnxKqHdJnkZ1yYGlJgPueoiNOMivmejDxCbeVl3A0iV5FdEFmO3DgG%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue"

HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"
CRAWLPAGE_URL = "https://advantage.marketline.com/News/NewsListing?q[]=aerospace+and+defense&IsSearchApi=true"
COOKIES_FILE = "marketline_cookies.json"
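
# COOKIES_FILE holds whatever Playwright's context.cookies() returned at login
# time: a JSON list of cookie dicts. Illustrative shape only (the cookie name
# and values below are assumptions, not a real MarketLine session):
#
#   [
#     {"name": "SessionId", "value": "...", "domain": ".marketline.com",
#      "path": "/", "expires": -1, "httpOnly": true, "secure": true,
#      "sameSite": "Lax"}
#   ]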

# --- CRAWLER SETTINGS ---
DEPTH = 2  # A depth of 2 is enough: Page 1 (List) -> Page 2 (Articles)
COUNT = 50

# UPDATED: Expanded keywords to better score article pages
SCRAPER_KEYWORDS = [
    "arms", "weapons", "military", "defence", "defense", "aerospace",
    "canadian armed forces", "caf", "dnd", "global affairs canada",
    "export", "sale", "contract", "procurement", "acquisition",
    "armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
    "aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
    "general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
]

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
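
# Sketch (not wired in anywhere): the keyword list above is presumably meant to
# feed the imported KeywordRelevanceScorer so that BestFirstCrawlingStrategy
# visits likely-relevant pages first. Kept commented out so behaviour is
# unchanged; the weight value is an assumption.
#
# keyword_scorer = KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS, weight=0.7)
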

async def login_and_save_cookies():
    """Open a visible browser, let the user log in manually, then save the session cookies."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()
        try:
            logging.info("Starting login process... Please complete login in the browser.")
            await page.goto(LOGIN_URL)
            # Give the user ~45 seconds to complete the interactive login
            # (including any MFA/captcha) without blocking the event loop.
            await asyncio.sleep(45)
            # await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
            logging.info("Login successful. Saving session cookies...")
            cookies = await context.cookies()
            with open(COOKIES_FILE, "w") as f:
                json.dump(cookies, f, indent=2)
            logging.info(f"Cookies saved to '{COOKIES_FILE}'.")
        except Exception as e:
            logging.error(f"Login failed: {e}")
        finally:
            await context.close()
            await browser.close()


def save_results_to_json(successful_data, failed_pages):
    """Write successful and failed crawl results to a timestamped output directory."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"crawl_results_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)
    logging.info(f"Saving results to '{output_dir}' directory...")

    successful_file = os.path.join(output_dir, "successful_pages.json")
    with open(successful_file, "w", encoding="utf-8") as f:
        json.dump(successful_data, f, indent=4, ensure_ascii=False)
    logging.info(f"Saved {len(successful_data)} pages to '{successful_file}'")

    if failed_pages:
        failed_file = os.path.join(output_dir, "failed_pages.json")
        with open(failed_file, "w", encoding="utf-8") as f:
            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
        logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")
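
# Output layout produced by save_results_to_json (timestamp will differ per run):
#
#   crawl_results_20250101_120000/
#       successful_pages.json   # list of {url, content, depth, score, timestamp}
#       failed_pages.json       # list of {url, error}; only written when pages failed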


async def crawl_with_saved_cookies():
    """Deep-crawl the MarketLine news listing using the saved session cookies."""
    if not os.path.exists(COOKIES_FILE):
        logging.warning("No cookies found. Running login first...")
        await login_and_save_cookies()
        if not os.path.exists(COOKIES_FILE):
            logging.error("Login failed or was aborted. Exiting.")
            return

    with open(COOKIES_FILE, "r") as f:
        try:
            cookies = json.load(f)
        except json.JSONDecodeError:
            logging.error(f"Error reading cookies file. Please delete '{COOKIES_FILE}' and run again.")
            return

    logging.info(f"Loaded {len(cookies)} cookies for crawling.")
    browser_config = BrowserConfig(cookies=cookies, headless=False)

    # NEW: Define a filter to only follow links that are news articles.
    # (Currently not attached to the strategy; see the commented sketch below.)
    article_filter = URLPatternFilter(patterns=[r"/news/"])

    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=DEPTH,
            max_pages=COUNT,
            # url_scorer and article_filter are not wired in yet; see the
            # commented wiring sketch just below this config.
        ),
        # scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True, stream=True, page_timeout=120000,
        wait_until="domcontentloaded"
    )
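
    # Sketch (assumptions about the crawl4ai deep-crawl API; kept commented out
    # so behaviour is unchanged): attaching the article filter and a keyword
    # scorer to the strategy would look roughly like this, with FilterChain
    # imported from crawl4ai.deep_crawling.filters.
    #
    # from crawl4ai.deep_crawling.filters import FilterChain
    #
    # config = CrawlerRunConfig(
    #     deep_crawl_strategy=BestFirstCrawlingStrategy(
    #         max_depth=DEPTH,
    #         max_pages=COUNT,
    #         url_scorer=KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS, weight=0.7),
    #         filter_chain=FilterChain([article_filter]),
    #     ),
    #     verbose=True, stream=True, page_timeout=120000,
    #     wait_until="domcontentloaded",
    # )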

    successful_data = []
    failed_pages = []

    logging.info("Starting crawl...")
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Start from the news listing page. With stream=True, arun() yields
        # results as each page finishes rather than returning them all at once.
        # (The article filter is defined above but not attached to the strategy.)
        async for result in await crawler.arun(CRAWLPAGE_URL, config=config):
            if result.success:
                print("RESULT:", result)
                score = result.metadata.get("score", 0)
                print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
                successful_data.append({
                    "url": result.url, "content": result.markdown,
                    "depth": result.metadata.get("depth", 0), "score": round(score, 2),
                    "timestamp": datetime.now().isoformat()
                })
            else:
                print(f"❌ Failed: {result.url} - {result.error_message}")
                failed_pages.append({'url': result.url, 'error': result.error_message})

    logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
    save_results_to_json(successful_data, failed_pages)


if __name__ == "__main__":
    asyncio.run(crawl_with_saved_cookies())