scraper handoff + updated reqs
parent 725f028d69
commit b1f3115999
@ -1,2 +1,4 @@
.env
marketline_cookies.json
+venv/
+marketline_session/
@ -240,8 +240,8 @@ def main():

        # basic required-field check (we want the API-required fields present)
        if not is_valid_transaction(tx):
-            print(" ⚠️ Skipping — missing required API fields in extracted transaction:", tx)
-            continue
+            print(" ⚠️ missing required API fields in extracted transaction:", tx)
+            #continue

        # Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
        # Save the item
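The helper `is_valid_transaction` is referenced in this hunk but defined outside it; as a rough, hypothetical sketch of the kind of required-field check the comment describes (the field names below are assumptions, not taken from the repo):

# Hypothetical sketch of the validator referenced above; REQUIRED_FIELDS is an assumption.
REQUIRED_FIELDS = ("company", "country", "amount", "date")

def is_valid_transaction(tx: dict) -> bool:
    # A transaction is usable only if every API-required field is present and non-empty.
    return all(tx.get(field) not in (None, "") for field in REQUIRED_FIELDS)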
@ -1 +1 @@
-*.json
+*.json
@ -1,186 +1,138 @@
+# NOT USED CURRENTLY
+# temporarily (if not permanently) switched to using the handoff file to get around captchas and similar blockers
+# the handoff is very similar anyway
+
import asyncio
-from playwright.async_api import async_playwright, Page
+from playwright.async_api import async_playwright
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
-from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
-from crawl4ai.content_scraping_strategy import ContentScrapingStrategy, ScrapingResult, LXMLWebScrapingStrategy
-from crawl4ai.processors.pdf import PDFContentScrapingStrategy
+from crawl4ai.deep_crawling import DFSDeepCrawlStrategy, BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.deep_crawling.filters import URLPatternFilter
from datetime import datetime
+import logging
+import time

# --- CONFIGURATION ---
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Fs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"
-# TODO: this will need to change for different organizations (ie univiersities)
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fVLBbsIwDP2VKvc2aaBAI4rE4DAktiFgO%2BwypcGFSGnSxenY%2Fn4FNo1dOFp%2Bfs%2Fv2WOUtWnEtA0Hu4b3FjBEn7WxKM6NgrTeCidRo7CyBhRBic30YSl4wkTjXXDKGRJNEcEH7ezMWWxr8BvwH1rB83pZkEMIDQpKZSeSGF0mrdu3YJpDoiTt6v0RG7o56LJ0BjoIoqMnDU5XT5stiebdUtrKE%2F0fmXF7bZNaK%2B%2FQVcFZoy0kytW0hAGXKS9jruQu7ucyjWVWyXiUVf2ql2YDOdzRkztOosW8IG9pztRI5izLh9BPcz5i%2BY4N0q6oFAz7vQ6G2MLCYpA2FIQznsUsj1lvy5hgPcGGryRa%2FYRxp%2B1O2%2F3t5MoLCMX9druKLzZfwOPZYgcgk%2FFpQ3EW9lcXuU0rf89AJjdDx2ZMr%2FgvYo147AgX85UzWn1FU2PcceZBBihISujkMvL%2FVybf&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Dsa3ysaynan5loqzpdleq8f1v4ji31utw%26csrfmiddlewaretoken%3DVnbRmbY0l1tnxKqHdJnkZ1yYGlJgPueoiNOMivmejDxCbeVl3A0iV5FdEFmO3DgG%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue"

-# make this the link for university login when accessing marketline
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Bs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"

-# shouldnt need to change. this is what we will wait for to load after logging in to trigger saving cookies.
HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"

-# the root page to seed crawling
-CRAWLPAGE_URL = "https://advantage.marketline.com/Search?industry=2800001"
-# trying out another page
-# CRAWLPAGE_URL = "https://www.defensenews.com/"
+CRAWLPAGE_URL = "https://advantage.marketline.com/News/NewsListing?q[]=aerospace+and+defense&IsSearchApi=true"

-# name of file where cookies are saved
COOKIES_FILE = "marketline_cookies.json"

# --- CRAWLER SETTINGS ---
-DEPTH = 3
-COUNT = 100
-# TODO: maybe make this list more comprehensive?
+DEPTH = 2  # A depth of 2 is enough: Page 1 (List) -> Page 2 (Articles)
+COUNT = 50
+# UPDATED: Expanded keywords to better score article pages
SCRAPER_KEYWORDS = [
-    # Core Terms
-    "arms export", "arms sale", "arms trade", "weapons export", "weapons deal",
-    "military export", "defence contract", "defense contract",
-    # Canadian Context
-    "canadian armed forces", "global affairs canada", "canadian defence",
-    "canadian military", "royal canadian navy", "royal canadian air force",
-
-    # Equipment & Technology
-    "armoured vehicle", "light armoured vehicle", "lav", "naval ship", "warship",
-    "frigate", "fighter jet", "military aircraft", "surveillance", "radar",
-    "artillery", "munitions", "firearms", "aerospace",
-
-    # Action & Policy Terms
-    "procurement", "acquisition", "military aid", "export permit", "itar"
+    "arms", "weapons", "military", "defence", "defense", "aerospace",
+    "canadian armed forces", "caf", "dnd", "global affairs canada",
+    "export", "sale", "contract", "procurement", "acquisition",
+    "armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
+    "aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
+    "general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
]

-# runs login process and saves cookies so that we can run the scraping with authentication
-async def login_and_save_cookies():
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

+async def login_and_save_cookies():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()

        try:
+            logging.info("Starting login process... Please complete login in the browser.")
            await page.goto(LOGIN_URL)
-            await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
-            print("Login detected. Saving session cookies...")
+            time.sleep(45)
+            # await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
+            logging.info("Login successful. Saving session cookies...")
            cookies = await context.cookies()
            with open(COOKIES_FILE, "w") as f:
-                json.dump(cookies, f)
+                json.dump(cookies, f, indent=2)
-            print("Cookies saved successfully!")
-            await crawl_with_saved_cookies()
+            logging.info(f"Cookies saved to '{COOKIES_FILE}'.")

        except Exception as e:
-            print(f"Login failed: {e}")
-            print("Error details:")
-            print(await page.content())
+            logging.error(f"Login failed: {e}")

        finally:
            await context.close()
            await browser.close()

def save_results_to_json(successful_data, failed_pages):
-    """
-    Saves the successful and failed crawl results into separate JSON files
-    in a dedicated directory.
-    """
-    output_dir = "crawl_results"
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_dir = f"crawl_results_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)
-    print(f"\n💾 Saving results to '{output_dir}' directory...")
+    logging.info(f"Saving results to '{output_dir}' directory...")

-    # Define file paths
    successful_file = os.path.join(output_dir, "successful_pages.json")
-    failed_file = os.path.join(output_dir, "failed_pages.json")

-    # Save successfully scraped data
    with open(successful_file, "w", encoding="utf-8") as f:
        json.dump(successful_data, f, indent=4, ensure_ascii=False)
-    print(f" Saved data for {len(successful_data)} successful pages to '{successful_file}'")
+    logging.info(f"Saved {len(successful_data)} pages to '{successful_file}'")

-    # Save failed pages if any
    if failed_pages:
+        failed_file = os.path.join(output_dir, "failed_pages.json")
        with open(failed_file, "w", encoding="utf-8") as f:
            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
-        print(f" Saved info for {len(failed_pages)} failed pages to '{failed_file}'")
+        logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")

-# runs the crawler with the cookies collected during login
async def crawl_with_saved_cookies():

    if not os.path.exists(COOKIES_FILE):
-        print("No cookies found. Please run login first.")
-        return
+        logging.warning("No cookies found. Running login first...")
+        await login_and_save_cookies()
+        if not os.path.exists(COOKIES_FILE):
+            logging.error("Login failed or was aborted. Exiting.")
+            return

    with open(COOKIES_FILE, "r") as f:
-        cookies = json.load(f)
+        try:
+            cookies = json.load(f)
+        except json.JSONDecodeError:
+            logging.error(f"Error reading cookies file. Please delete '{COOKIES_FILE}' and run again.")
+            return

-    browser_config = BrowserConfig(cookies=cookies)
+    logging.info(f"Loaded {len(cookies)} cookies for crawling.")
+    browser_config = BrowserConfig(cookies=cookies, headless=False)

+    # NEW: Define a filter to only follow links that are news articles
+    article_filter = URLPatternFilter(patterns=[r"/news/"])

    config = CrawlerRunConfig(
-        deep_crawl_strategy=BFSDeepCrawlStrategy(
+        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=DEPTH,
            max_pages=COUNT,
-            url_scorer=KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS,),
+            #url_scorer=(),
+            # url_filters=[article_filter]  # UPDATED: Add the filter to the strategy
        ),
-        scraping_strategy=LXMLWebScrapingStrategy(),
-        # TODO: scrape the PDFs better
-        # scraping_strategy=PDFCrawlerStrategy(),
-        verbose=True,
-        stream=True,
-        page_timeout=30000
+        # scraping_strategy=
+        # LXMLWebScrapingStrategy(),
+        verbose=True, stream=True, page_timeout=120000,
+        wait_until="domcontentloaded"
    )

    successful_data = []
    failed_pages = []

+    logging.info("Starting crawl...")
    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # We start with the list page. The filter will ensure we only crawl article links from it.
        async for result in await crawler.arun(CRAWLPAGE_URL, config=config):
            if result.success:
-                depth = result.metadata.get("depth", 0)
+                print("RESULT:", result)
                score = result.metadata.get("score", 0)
+                print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
-                # here we could look at a few things, the HTML, markdown, raw text, etc.
-                scraped_content = result.markdown
-
-                print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}")
-                # NEW: Print a preview of the content to confirm it's being scraped
-                print(f" 📄 Content length: {len(scraped_content)}. Preview: {scraped_content[:120]}...")

                successful_data.append({
-                    "url": result.url,
-                    "content": scraped_content,
-                    "depth": depth,
-                    "score": round(score, 2)
+                    "url": result.url, "content": result.markdown,
+                    "depth": result.metadata.get("depth", 0), "score": round(score, 2),
+                    "timestamp": datetime.now().isoformat()
                })
            else:
-                failed_pages.append({
-                    'url': result.url,
-                    'error': result.error_message,
-                    'depth': result.metadata.get("depth", 0)
-                })
                print(f"❌ Failed: {result.url} - {result.error_message}")
+                failed_pages.append({'url': result.url, 'error': result.error_message})

-    print(f"📊 Results: {len(successful_data)} successful, {len(failed_pages)} failed")
+    logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
    save_results_to_json(successful_data, failed_pages)

-    # Analyze failures by depth
-    if failed_pages:
-        failure_by_depth = {}
-        for failure in failed_pages:
-            depth = failure['depth']
-            failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1
-
-        print("❌ Failures by depth:")
-        for depth, count in sorted(failure_by_depth.items()):
-            print(f" Depth {depth}: {count} failures")

if __name__ == "__main__":
-    # Choose which function to run
-    # 1. First, run the login function once to get your cookies
-    # asyncio.run(login_and_save_cookies())
-
-    # 2. Then, comment out the login line and run the crawl
    asyncio.run(crawl_with_saved_cookies())
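One note on the login wait above: `time.sleep(45)` blocks the event loop inside an async function. A minimal non-blocking sketch of the same step, assuming the Marketline homepage URL is still the signal that login finished and keeping the same 45-second budget:

import asyncio

async def wait_for_login(page, homepage_url: str, budget_s: int = 45):
    """Sketch: prefer an awaitable wait over time.sleep() so other tasks keep running."""
    try:
        # Playwright's own wait, bounded by the same budget (timeout is in milliseconds).
        await page.wait_for_url(homepage_url, timeout=budget_s * 1000)
    except Exception:
        # Fall back to a plain non-blocking pause if the URL never settles.
        await asyncio.sleep(budget_s)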
@ -0,0 +1,176 @@
# like the crawler, but with a session handoff instead of a cookie-sharing approach
# opens a non-headless browser; the user logs in and solves any captchas, then hands off to the scraper

# more reliable, easier to debug, and captcha resistant

import asyncio
from itertools import chain
from playwright.async_api import async_playwright
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import URLPatternFilter
from datetime import datetime
import logging
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter

# --- CONFIGURATION ---
# MODIFIED: Only the login URL is needed for the initial navigation.
# The user will navigate to the crawl starting page manually.
LOGIN_URL = "https://guides.lib.uoguelph.ca/az/databases?q=marketline"

# --- CRAWLER SETTINGS ---
DEPTH = 2
COUNT = 50
SCRAPER_KEYWORDS = [
    "arms", "weapons", "military", "defence", "defense", "aerospace",
    "canadian armed forces", "caf", "dnd", "global affairs canada",
    "export", "sale", "contract", "procurement", "acquisition",
    "armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
    "aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
    "general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
]

# class DebugFilter(BaseFilter):
#     def apply(self, urls):
#         print("\n=== LINKS BEFORE FILTERING ===")
#         for u in urls:
#             print(u)
#         return urls  # don't drop anything

include_words = URLPatternFilter(patterns=["*News*", "*news*"])
deny_words = URLPatternFilter(patterns=["*Analysis*", "*Sectors*", "*Commentsandopinions*", "*Dashboard*", "*Homepage*"], reverse=True)

# --- SETUP LOGGING ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def save_results_to_json(successful_data, failed_pages):
    """Saves the crawl results to JSON files in a dedicated directory."""
    # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # could timestamp this, but not doing so is easier to analyze.
    # later we will probably have one folder with all the timestamped files that we go through with a regex;
    # for now we'll just overwrite.
    output_dir = f"crawl_results"
    os.makedirs(output_dir, exist_ok=True)
    logging.info(f"Saving results to '{output_dir}' directory...")

    successful_file = os.path.join(output_dir, "successful_pages.json")
    with open(successful_file, "w", encoding="utf-8") as f:
        json.dump(successful_data, f, indent=4, ensure_ascii=False)
    logging.info(f"Saved {len(successful_data)} successful pages to '{successful_file}'")

    if failed_pages:
        failed_file = os.path.join(output_dir, "failed_pages.json")
        with open(failed_file, "w", encoding="utf-8") as f:
            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
        logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")

async def main():
    """
    Main function to handle manual login, capture the session state from the
    active tab, and then hand it off to the crawler.
    """
    # --- STEP 1: Manual Login in a Temporary Browser ---
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()  # This is the initial page

        logging.info("A browser window has opened. Please complete the following steps:")
        logging.info(f"1. Log in and navigate to the exact page where you want the crawl to begin.")
        logging.info("2. Solve any CAPTCHAs or 2FA prompts.")
        await page.goto(LOGIN_URL)

        input("\n>>> Press Enter in this console window once you are logged in and on the starting page... <<<\n")

        # MODIFIED: Instead of using the original 'page' object, get the current active tab.
        # This correctly handles cases where the login process opens a new tab.
        print("ALL PAGES:")
        for page in context.pages:
            print("URL: ", page.url)
        active_page = context.pages[-1]
        start_url = "https://advantage.marketline.com/News/NewsListing?q%5B%5D=aerospace+and+defence&IsSearchApi=true&exactword=1"

        logging.info(f"Login complete. Using active tab URL to start crawl: {start_url}")

        # Capture the full session state (cookies, localStorage, etc.)
        storage_state = await context.storage_state()

        # We no longer need this temporary browser.
        await browser.close()

    # --- STEP 2: Configure and Run the Crawler with the Captured State ---

    # Pass the captured 'storage_state' dictionary to the crawler's browser configuration.
    browser_config = BrowserConfig(
        headless=False,
        storage_state=storage_state  # This injects your logged-in session.
    )

    scorer = KeywordRelevanceScorer(
        keywords=SCRAPER_KEYWORDS,
        weight=0.7
    )

    filter = FilterChain([
        # DebugFilter(),
        include_words,
        deny_words
    ])

    # This configuration remains the same
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=DEPTH,
            max_pages=COUNT,
            url_scorer=scorer,
            filter_chain=filter
        ),
        verbose=True,
        stream=True,
        page_timeout=120000,
        wait_until="domcontentloaded"
    )

    successful_data = []
    failed_pages = []

    logging.info("Starting crawler with the captured session state...")

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # The crawler will now begin at the correct URL you navigated to.
        async for result in await crawler.arun(start_url, config=config):
            if result.success:
                all_links = [
                    l["href"]
                    for l in chain(result.links.get("internal", []), result.links.get("external", []))
                ]

                print(f"✅ Scraped: {result.url}")
                print("Filtered links:")

                # Apply filters one URL at a time
                for url in all_links:
                    if include_words.apply(url) and deny_words.apply(url):
                        print("  ->", url)
                score = result.metadata.get("score", 0)
                print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
                successful_data.append({
                    "url": result.url, "content": result.markdown,
                    "depth": result.metadata.get("depth", 0), "score": round(score, 2),
                    "timestamp": datetime.now().isoformat()
                })
            else:
                print(f"❌ Failed: {result.url} - {result.error_message}")
                failed_pages.append({'url': result.url, 'error': result.error_message})

    logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
    save_results_to_json(successful_data, failed_pages)


if __name__ == "__main__":
    asyncio.run(main())
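The handoff above keeps `storage_state` only in memory, so every run needs a fresh manual login. A small sketch of persisting it to disk so later runs can skip the browser step; the file path is hypothetical (the new `marketline_session/` .gitignore entry suggests something similar), and it assumes `BrowserConfig` accepts the same storage-state dictionary that Playwright's `context.storage_state()` returns, which is how it is used in the script:

import json
import os
from crawl4ai import BrowserConfig

STATE_FILE = "marketline_session/storage_state.json"  # hypothetical path

def save_state(storage_state: dict) -> None:
    # Persist the dict returned by context.storage_state() for reuse across runs.
    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
    with open(STATE_FILE, "w", encoding="utf-8") as f:
        json.dump(storage_state, f)

def browser_config_from_saved_state():
    # Reuse the saved session if it exists; otherwise fall back to the manual handoff.
    if not os.path.exists(STATE_FILE):
        return None
    with open(STATE_FILE, "r", encoding="utf-8") as f:
        return BrowserConfig(headless=False, storage_state=json.load(f))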
@ -0,0 +1,93 @@
import asyncio
import os
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
# import database

# --- Configuration ---
PDF_QUEUE_FILE = "pdf_queue.txt"
COOKIES_FILE = "marketline_cookies.json"
IMAGE_OUTPUT_DIR = "./extracted_images"
CHECK_INTERVAL_SECONDS = 60

def load_cookies():
    """Loads cookies from the JSON file if it exists."""
    if not os.path.exists(COOKIES_FILE):
        print(f"Warning: {COOKIES_FILE} not found. Crawling without authentication.")
        return None
    with open(COOKIES_FILE, 'r') as f:
        cookies = json.load(f)
    return {c['name']: c['value'] for c in cookies}

async def process_pdf_queue(cookies):
    """
    Processes all unique URLs found in the PDF queue file.
    """
    if not os.path.exists(PDF_QUEUE_FILE):
        return

    print("--- Checking PDF queue for new links ---")
    with open(PDF_QUEUE_FILE, "r") as f:
        urls_to_process = set(line.strip() for line in f if line.strip())

    if not urls_to_process:
        print("PDF queue is empty.")
        return

    print(f"Found {len(urls_to_process)} PDF(s) to process.")
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)

    pdf_scraping_cfg = PDFContentScrapingStrategy(
        extract_images=True,
        save_images_locally=True,
        image_save_dir=IMAGE_OUTPUT_DIR,
    )
    pdf_run_cfg = CrawlerRunConfig(scraping_strategy=pdf_scraping_cfg)

    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        for url in urls_to_process:
            print(f"\nProcessing PDF: {url}")
            try:
                result = await crawler.arun(url=url, config=pdf_run_cfg, cookies=cookies)
                if not result.success:
                    print(f"Failed to process PDF {result.url}. Error: {result.error_message}")
                    continue

                content = result.markdown.raw_markdown if result.markdown else ""
                print(f"PAGE CONTENT: {content}")
                # page_id = database.add_crawled_page(result.url, content, 'pdf')

                # if page_id and result.media and result.media.get("images"):
                #     print(f"Found {len(result.media['images'])} images in {result.url}")
                #     for img_info in result.media["images"]:
                #         database.add_crawled_image(
                #             page_id=page_id,
                #             page_number=img_info.get('page'),
                #             local_path=img_info.get('path'),
                #             img_format=img_info.get('format')
                #         )
                print(f"Successfully processed and stored PDF: {result.url}")

            except Exception as e:
                print(f"A critical error occurred while processing PDF '{url}': {e}")

    with open(PDF_QUEUE_FILE, "w") as f:
        f.write("")
    print("\n--- PDF queue processing finished ---")

async def main():
    """Main entry point that runs the PDF processing loop."""
    # database.setup_database()
    print("PDF Processor service starting...")
    cookies = load_cookies()
    while True:
        await process_pdf_queue(cookies)
        print(f"Queue check finished. Waiting {CHECK_INTERVAL_SECONDS}s for next check.")
        await asyncio.sleep(CHECK_INTERVAL_SECONDS)

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nPDF Processor service stopped by user.")
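The processor above drains `pdf_queue.txt`, but the producer side is not in this diff; a minimal sketch of how the crawler could enqueue PDF links it encounters, using the same `result.links` structure the handoff script iterates over (the `.pdf` suffix check and the append-only protocol are assumptions):

PDF_QUEUE_FILE = "pdf_queue.txt"

def enqueue_pdf_links(result) -> int:
    """Append any PDF-looking links from a crawl result to the queue file."""
    links = result.links.get("internal", []) + result.links.get("external", [])
    pdf_urls = {l["href"] for l in links if l.get("href", "").lower().split("?")[0].endswith(".pdf")}
    if not pdf_urls:
        return 0
    with open(PDF_QUEUE_FILE, "a", encoding="utf-8") as f:
        for url in sorted(pdf_urls):
            f.write(url + "\n")
    return len(pdf_urls)

Duplicate appends are harmless here, since process_pdf_queue() already de-duplicates the queue with a set before crawling.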
@ -1,29 +1,110 @@
+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+aiosqlite==0.21.0
+alphashape==1.3.1
+annotated-types==0.7.0
+anyio==4.10.0
+attrs==25.3.0
beautifulsoup4==4.13.4
+Brotli==1.1.0
cachetools==5.5.2
-certifi==2025.7.14
-charset-normalizer==3.4.2
-dotenv==0.9.9
+certifi==2025.8.3
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.4.3
+click==8.2.1
+click-log==0.4.0
+Crawl4AI==0.7.4
+cryptography==45.0.6
+distro==1.9.0
+fake-http-header==0.3.5
+fake-useragent==2.2.0
+filelock==3.19.1
+frozenlist==1.7.0
+fsspec==2025.7.0
google==3.0.0
-google-ai-generativelanguage==0.1.0
+google-ai-generativelanguage==0.6.15
google-api-core==2.25.1
-google-api-python-client==2.177.0
+google-api-python-client==2.179.0
google-auth==2.40.3
google-auth-httplib2==0.2.0
-google-generativeai==0.1.0rc1
+google-generativeai==0.8.5
googleapis-common-protos==1.70.0
-grpcio==1.70.0
-grpcio-status==1.62.3
+greenlet==3.2.4
+grpcio==1.74.0
+grpcio-status==1.71.2
+h11==0.16.0
+h2==4.2.0
+hf-xet==1.1.8
+hpack==4.1.0
+httpcore==1.0.9
httplib2==0.22.0
+httpx==0.28.1
+huggingface-hub==0.34.4
+humanize==4.12.3
+hyperframe==6.1.0
idna==3.10
+importlib_metadata==8.7.0
+Jinja2==3.1.6
+jiter==0.10.0
+joblib==1.5.1
+jsonschema==4.25.1
+jsonschema-specifications==2025.4.1
+lark==1.2.2
+litellm==1.75.9
+lxml==5.4.0
+markdown-it-py==4.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+multidict==6.6.4
+networkx==3.5
+nltk==3.9.1
+numpy==2.3.2
+openai==1.100.2
+packaging==25.0
+patchright==1.52.5
+pillow==11.3.0
+playwright==1.54.0
+propcache==0.3.2
proto-plus==1.26.1
-protobuf==4.25.8
+protobuf==5.29.5
+psutil==7.0.0
pyasn1==0.6.1
-pyasn1-modules==0.4.2
-pyparsing==3.1.4
-python-dotenv==1.0.1
-requests==2.32.4
+pyasn1_modules==0.4.2
+pycparser==2.22
+pydantic==2.11.7
+pydantic_core==2.33.2
+pyee==13.0.0
+Pygments==2.19.2
+pyOpenSSL==25.1.0
+pyparsing==3.2.3
+PyPDF2==3.0.1
+python-dotenv==1.1.1
+PyYAML==6.0.2
+rank-bm25==0.2.2
+referencing==0.36.2
+regex==2025.7.34
+requests==2.32.5
+rich==14.1.0
+rpds-py==0.27.0
rsa==4.9.1
+rtree==1.4.1
+scipy==1.16.1
+shapely==2.1.1
+sniffio==1.3.1
+snowballstemmer==2.2.0
soupsieve==2.7
-typing-extensions==4.13.2
-uritemplate==4.1.1
-urllib3==2.2.3
+tf-playwright-stealth==1.2.0
+tiktoken==0.11.0
+tokenizers==0.21.4
+tqdm==4.67.1
+trimesh==4.7.4
+typing-inspection==0.4.1
+typing_extensions==4.14.1
+uritemplate==4.2.0
+urllib3==2.5.0
+xxhash==3.5.0
+yarl==1.20.1
+zipp==3.23.0