scraper handoff + updated reqs
parent 725f028d69
commit b1f3115999
@ -1,2 +1,4 @@
.env
marketline_cookies.json
+venv/
+marketline_session/
@ -240,8 +240,8 @@ def main():

        # basic required-field check (we want the API-required fields present)
        if not is_valid_transaction(tx):
-            print(" ⚠️ Skipping — missing required API fields in extracted transaction:", tx)
-            continue
+            print(" ⚠️ missing required API fields in extracted transaction:", tx)
+            #continue

        # Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
        # Save the item
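The helper `is_valid_transaction` is referenced in this hunk but defined outside it; as a rough, hypothetical sketch of the kind of required-field check the comment describes (the field names below are assumptions, not taken from the repo):

# Hypothetical sketch of the validator referenced above; REQUIRED_FIELDS is an assumption.
REQUIRED_FIELDS = ("company", "country", "amount", "date")

def is_valid_transaction(tx: dict) -> bool:
    # A transaction is usable only if every API-required field is present and non-empty.
    return all(tx.get(field) not in (None, "") for field in REQUIRED_FIELDS)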
@ -1 +1 @@
-*.json
+*.json
@ -1,186 +1,138 @@
+# NOT USED CURRENTLY
+# temporarily (if not permanently) switched to using the handoff file to get around captchas and similar blockers
+# the handoff is very similar anyway
+
import asyncio
-from playwright.async_api import async_playwright, Page
+from playwright.async_api import async_playwright
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
-from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
-from crawl4ai.content_scraping_strategy import ContentScrapingStrategy, ScrapingResult, LXMLWebScrapingStrategy
-from crawl4ai.processors.pdf import PDFContentScrapingStrategy
+from crawl4ai.deep_crawling import DFSDeepCrawlStrategy, BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.deep_crawling.filters import URLPatternFilter
from datetime import datetime
+import logging
+import time

# --- CONFIGURATION ---
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Fs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"
-# TODO: this will need to change for different organizations (ie univiersities)
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fVLBbsIwDP2VKvc2aaBAI4rE4DAktiFgO%2BwypcGFSGnSxenY%2Fn4FNo1dOFp%2Bfs%2Fv2WOUtWnEtA0Hu4b3FjBEn7WxKM6NgrTeCidRo7CyBhRBic30YSl4wkTjXXDKGRJNEcEH7ezMWWxr8BvwH1rB83pZkEMIDQpKZSeSGF0mrdu3YJpDoiTt6v0RG7o56LJ0BjoIoqMnDU5XT5stiebdUtrKE%2F0fmXF7bZNaK%2B%2FQVcFZoy0kytW0hAGXKS9jruQu7ucyjWVWyXiUVf2ql2YDOdzRkztOosW8IG9pztRI5izLh9BPcz5i%2BY4N0q6oFAz7vQ6G2MLCYpA2FIQznsUsj1lvy5hgPcGGryRa%2FYRxp%2B1O2%2F3t5MoLCMX9druKLzZfwOPZYgcgk%2FFpQ3EW9lcXuU0rf89AJjdDx2ZMr%2FgvYo147AgX85UzWn1FU2PcceZBBihISujkMvL%2FVybf&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Dsa3ysaynan5loqzpdleq8f1v4ji31utw%26csrfmiddlewaretoken%3DVnbRmbY0l1tnxKqHdJnkZ1yYGlJgPueoiNOMivmejDxCbeVl3A0iV5FdEFmO3DgG%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue"

-# make this the link for university login when accessing marketline
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Bs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"

-# shouldnt need to change. this is what we will wait for to load after logging in to trigger saving cookies.
HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"

-# the root page to seed crawling
-CRAWLPAGE_URL = "https://advantage.marketline.com/Search?industry=2800001"
-# trying out another page
-# CRAWLPAGE_URL = "https://www.defensenews.com/"
+CRAWLPAGE_URL = "https://advantage.marketline.com/News/NewsListing?q[]=aerospace+and+defense&IsSearchApi=true"

-# name of file where cookies are saved
COOKIES_FILE = "marketline_cookies.json"

# --- CRAWLER SETTINGS ---
-DEPTH = 3
-COUNT = 100
-# TODO: maybe make this list more comprehensive?
+DEPTH = 2  # A depth of 2 is enough: Page 1 (List) -> Page 2 (Articles)
+COUNT = 50
+# UPDATED: Expanded keywords to better score article pages
SCRAPER_KEYWORDS = [
-    # Core Terms
-    "arms export", "arms sale", "arms trade", "weapons export", "weapons deal",
-    "military export", "defence contract", "defense contract",
-    # Canadian Context
-    "canadian armed forces", "global affairs canada", "canadian defence",
-    "canadian military", "royal canadian navy", "royal canadian air force",
-
-    # Equipment & Technology
-    "armoured vehicle", "light armoured vehicle", "lav", "naval ship", "warship",
-    "frigate", "fighter jet", "military aircraft", "surveillance", "radar",
-    "artillery", "munitions", "firearms", "aerospace",
-
-    # Action & Policy Terms
-    "procurement", "acquisition", "military aid", "export permit", "itar"
+    "arms", "weapons", "military", "defence", "defense", "aerospace",
+    "canadian armed forces", "caf", "dnd", "global affairs canada",
+    "export", "sale", "contract", "procurement", "acquisition",
+    "armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
+    "aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
+    "general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
]

-# runs login process and saves cookies so that we can run the scraping with authentication
-async def login_and_save_cookies():
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

+async def login_and_save_cookies():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()

        try:
+            logging.info("Starting login process... Please complete login in the browser.")
            await page.goto(LOGIN_URL)
-            await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
-            print("Login detected. Saving session cookies...")
+            time.sleep(45)
+            # await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
+            logging.info("Login successful. Saving session cookies...")
            cookies = await context.cookies()
            with open(COOKIES_FILE, "w") as f:
-                json.dump(cookies, f)
+                json.dump(cookies, f, indent=2)
-            print("Cookies saved successfully!")
-            await crawl_with_saved_cookies()
+            logging.info(f"Cookies saved to '{COOKIES_FILE}'.")

        except Exception as e:
-            print(f"Login failed: {e}")
-            print("Error details:")
-            print(await page.content())
+            logging.error(f"Login failed: {e}")

        finally:
            await context.close()
            await browser.close()

def save_results_to_json(successful_data, failed_pages):
-    """
-    Saves the successful and failed crawl results into separate JSON files
-    in a dedicated directory.
-    """
-    output_dir = "crawl_results"
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_dir = f"crawl_results_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)
-    print(f"\n💾 Saving results to '{output_dir}' directory...")
+    logging.info(f"Saving results to '{output_dir}' directory...")

-    # Define file paths
    successful_file = os.path.join(output_dir, "successful_pages.json")
-    failed_file = os.path.join(output_dir, "failed_pages.json")

-    # Save successfully scraped data
    with open(successful_file, "w", encoding="utf-8") as f:
        json.dump(successful_data, f, indent=4, ensure_ascii=False)
-    print(f" Saved data for {len(successful_data)} successful pages to '{successful_file}'")
+    logging.info(f"Saved {len(successful_data)} pages to '{successful_file}'")

-    # Save failed pages if any
    if failed_pages:
+        failed_file = os.path.join(output_dir, "failed_pages.json")
        with open(failed_file, "w", encoding="utf-8") as f:
            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
-        print(f" Saved info for {len(failed_pages)} failed pages to '{failed_file}'")
+        logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")

-# runs the crawler with the cookies collected during login
async def crawl_with_saved_cookies():

    if not os.path.exists(COOKIES_FILE):
-        print("No cookies found. Please run login first.")
-        return
+        logging.warning("No cookies found. Running login first...")
+        await login_and_save_cookies()
+        if not os.path.exists(COOKIES_FILE):
+            logging.error("Login failed or was aborted. Exiting.")
+            return

    with open(COOKIES_FILE, "r") as f:
-        cookies = json.load(f)
+        try:
+            cookies = json.load(f)
+        except json.JSONDecodeError:
+            logging.error(f"Error reading cookies file. Please delete '{COOKIES_FILE}' and run again.")
+            return

-    browser_config = BrowserConfig(cookies=cookies)
+    logging.info(f"Loaded {len(cookies)} cookies for crawling.")
+    browser_config = BrowserConfig(cookies=cookies, headless=False)

+    # NEW: Define a filter to only follow links that are news articles
+    article_filter = URLPatternFilter(patterns=[r"/news/"])

    config = CrawlerRunConfig(
-        deep_crawl_strategy=BFSDeepCrawlStrategy(
+        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=DEPTH,
            max_pages=COUNT,
-            url_scorer=KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS,),
+            #url_scorer=(),
+            # url_filters=[article_filter]  # UPDATED: Add the filter to the strategy
        ),
-        scraping_strategy=LXMLWebScrapingStrategy(),
-        # TODO: scrape the PDFs better
-        # scraping_strategy=PDFCrawlerStrategy(),
-        verbose=True,
-        stream=True,
-        page_timeout=30000
+        # scraping_strategy=
+        # LXMLWebScrapingStrategy(),
+        verbose=True, stream=True, page_timeout=120000,
+        wait_until="domcontentloaded"
    )

    successful_data = []
    failed_pages = []

+    logging.info("Starting crawl...")
    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # We start with the list page. The filter will ensure we only crawl article links from it.
        async for result in await crawler.arun(CRAWLPAGE_URL, config=config):
            if result.success:
-                depth = result.metadata.get("depth", 0)
+                print("RESULT:", result)
                score = result.metadata.get("score", 0)
+                print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
-                # here we could look at a few things, the HTML, markdown, raw text, etc.
-                scraped_content = result.markdown
-
-                print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}")
-                # NEW: Print a preview of the content to confirm it's being scraped
-                print(f" 📄 Content length: {len(scraped_content)}. Preview: {scraped_content[:120]}...")

                successful_data.append({
-                    "url": result.url,
-                    "content": scraped_content,
-                    "depth": depth,
-                    "score": round(score, 2)
+                    "url": result.url, "content": result.markdown,
+                    "depth": result.metadata.get("depth", 0), "score": round(score, 2),
+                    "timestamp": datetime.now().isoformat()
                })
            else:
-                failed_pages.append({
-                    'url': result.url,
-                    'error': result.error_message,
-                    'depth': result.metadata.get("depth", 0)
-                })
                print(f"❌ Failed: {result.url} - {result.error_message}")
+                failed_pages.append({'url': result.url, 'error': result.error_message})

-    print(f"📊 Results: {len(successful_data)} successful, {len(failed_pages)} failed")
+    logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
    save_results_to_json(successful_data, failed_pages)

-    # Analyze failures by depth
-    if failed_pages:
-        failure_by_depth = {}
-        for failure in failed_pages:
-            depth = failure['depth']
-            failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1
-
-        print("❌ Failures by depth:")
-        for depth, count in sorted(failure_by_depth.items()):
-            print(f" Depth {depth}: {count} failures")

if __name__ == "__main__":
-    # Choose which function to run
-    # 1. First, run the login function once to get your cookies
-    # asyncio.run(login_and_save_cookies())
-
-    # 2. Then, comment out the login line and run the crawl
    asyncio.run(crawl_with_saved_cookies())
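One note on the login wait above: `time.sleep(45)` blocks the event loop inside an async function. A minimal non-blocking sketch of the same step, assuming the Marketline homepage URL is still the signal that login finished and keeping the same 45-second budget:

import asyncio

async def wait_for_login(page, homepage_url: str, budget_s: int = 45):
    """Sketch: prefer an awaitable wait over time.sleep() so other tasks keep running."""
    try:
        # Playwright's own wait, bounded by the same budget (timeout is in milliseconds).
        await page.wait_for_url(homepage_url, timeout=budget_s * 1000)
    except Exception:
        # Fall back to a plain non-blocking pause if the URL never settles.
        await asyncio.sleep(budget_s)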
@ -0,0 +1,176 @@
# like the crawler, but with a session handoff instead of a cookie-sharing approach
# opens a non-headless browser; the user logs in and solves any captchas, then hands off to the scraper

# more reliable, easier to debug, and captcha resistant

import asyncio
from itertools import chain
from playwright.async_api import async_playwright
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import URLPatternFilter
from datetime import datetime
import logging
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter

# --- CONFIGURATION ---
# MODIFIED: Only the login URL is needed for the initial navigation.
# The user will navigate to the crawl starting page manually.
LOGIN_URL = "https://guides.lib.uoguelph.ca/az/databases?q=marketline"

# --- CRAWLER SETTINGS ---
DEPTH = 2
COUNT = 50
SCRAPER_KEYWORDS = [
    "arms", "weapons", "military", "defence", "defense", "aerospace",
    "canadian armed forces", "caf", "dnd", "global affairs canada",
    "export", "sale", "contract", "procurement", "acquisition",
    "armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
    "aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
    "general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
]

# class DebugFilter(BaseFilter):
#     def apply(self, urls):
#         print("\n=== LINKS BEFORE FILTERING ===")
#         for u in urls:
#             print(u)
#         return urls  # don't drop anything

include_words = URLPatternFilter(patterns=["*News*", "*news*"])
deny_words = URLPatternFilter(patterns=["*Analysis*", "*Sectors*", "*Commentsandopinions*", "*Dashboard*", "*Homepage*"], reverse=True)

# --- SETUP LOGGING ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def save_results_to_json(successful_data, failed_pages):
    """Saves the crawl results to JSON files in a dedicated directory."""
    # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # could timestamp this, but not doing so is easier to analyze.
    # later we will probably have one folder with all the timestamped files that we go through with a regex;
    # for now we'll just overwrite.
    output_dir = f"crawl_results"
    os.makedirs(output_dir, exist_ok=True)
    logging.info(f"Saving results to '{output_dir}' directory...")

    successful_file = os.path.join(output_dir, "successful_pages.json")
    with open(successful_file, "w", encoding="utf-8") as f:
        json.dump(successful_data, f, indent=4, ensure_ascii=False)
    logging.info(f"Saved {len(successful_data)} successful pages to '{successful_file}'")

    if failed_pages:
        failed_file = os.path.join(output_dir, "failed_pages.json")
        with open(failed_file, "w", encoding="utf-8") as f:
            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
        logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")

async def main():
    """
    Main function to handle manual login, capture the session state from the
    active tab, and then hand it off to the crawler.
    """
    # --- STEP 1: Manual Login in a Temporary Browser ---
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()  # This is the initial page

        logging.info("A browser window has opened. Please complete the following steps:")
        logging.info(f"1. Log in and navigate to the exact page where you want the crawl to begin.")
        logging.info("2. Solve any CAPTCHAs or 2FA prompts.")
        await page.goto(LOGIN_URL)

        input("\n>>> Press Enter in this console window once you are logged in and on the starting page... <<<\n")

        # MODIFIED: Instead of using the original 'page' object, get the current active tab.
        # This correctly handles cases where the login process opens a new tab.
        print("ALL PAGES:")
        for page in context.pages:
            print("URL: ", page.url)
        active_page = context.pages[-1]
        start_url = "https://advantage.marketline.com/News/NewsListing?q%5B%5D=aerospace+and+defence&IsSearchApi=true&exactword=1"

        logging.info(f"Login complete. Using active tab URL to start crawl: {start_url}")

        # Capture the full session state (cookies, localStorage, etc.)
        storage_state = await context.storage_state()

        # We no longer need this temporary browser.
        await browser.close()

    # --- STEP 2: Configure and Run the Crawler with the Captured State ---

    # Pass the captured 'storage_state' dictionary to the crawler's browser configuration.
    browser_config = BrowserConfig(
        headless=False,
        storage_state=storage_state  # This injects your logged-in session.
    )

    scorer = KeywordRelevanceScorer(
        keywords=SCRAPER_KEYWORDS,
        weight=0.7
    )

    filter = FilterChain([
        # DebugFilter(),
        include_words,
        deny_words
    ])

    # This configuration remains the same
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=DEPTH,
            max_pages=COUNT,
            url_scorer=scorer,
            filter_chain=filter
        ),
        verbose=True,
        stream=True,
        page_timeout=120000,
        wait_until="domcontentloaded"
    )

    successful_data = []
    failed_pages = []

    logging.info("Starting crawler with the captured session state...")

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # The crawler will now begin at the correct URL you navigated to.
        async for result in await crawler.arun(start_url, config=config):
            if result.success:
                all_links = [
                    l["href"]
                    for l in chain(result.links.get("internal", []), result.links.get("external", []))
                ]

                print(f"✅ Scraped: {result.url}")
                print("Filtered links:")

                # Apply filters one URL at a time
                for url in all_links:
                    if include_words.apply(url) and deny_words.apply(url):
                        print("  ->", url)
                score = result.metadata.get("score", 0)
                print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
                successful_data.append({
                    "url": result.url, "content": result.markdown,
                    "depth": result.metadata.get("depth", 0), "score": round(score, 2),
                    "timestamp": datetime.now().isoformat()
                })
            else:
                print(f"❌ Failed: {result.url} - {result.error_message}")
                failed_pages.append({'url': result.url, 'error': result.error_message})

    logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
    save_results_to_json(successful_data, failed_pages)


if __name__ == "__main__":
    asyncio.run(main())
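The handoff above keeps `storage_state` only in memory, so every run needs a fresh manual login. A small sketch of persisting it to disk so later runs can skip the browser step; the file path is hypothetical (the new `marketline_session/` .gitignore entry suggests something similar), and it assumes `BrowserConfig` accepts the same storage-state dictionary that Playwright's `context.storage_state()` returns, which is how it is used in the script:

import json
import os
from crawl4ai import BrowserConfig

STATE_FILE = "marketline_session/storage_state.json"  # hypothetical path

def save_state(storage_state: dict) -> None:
    # Persist the dict returned by context.storage_state() for reuse across runs.
    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
    with open(STATE_FILE, "w", encoding="utf-8") as f:
        json.dump(storage_state, f)

def browser_config_from_saved_state():
    # Reuse the saved session if it exists; otherwise fall back to the manual handoff.
    if not os.path.exists(STATE_FILE):
        return None
    with open(STATE_FILE, "r", encoding="utf-8") as f:
        return BrowserConfig(headless=False, storage_state=json.load(f))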
@ -0,0 +1,93 @@
import asyncio
import os
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
# import database

# --- Configuration ---
PDF_QUEUE_FILE = "pdf_queue.txt"
COOKIES_FILE = "marketline_cookies.json"
IMAGE_OUTPUT_DIR = "./extracted_images"
CHECK_INTERVAL_SECONDS = 60

def load_cookies():
    """Loads cookies from the JSON file if it exists."""
    if not os.path.exists(COOKIES_FILE):
        print(f"Warning: {COOKIES_FILE} not found. Crawling without authentication.")
        return None
    with open(COOKIES_FILE, 'r') as f:
        cookies = json.load(f)
    return {c['name']: c['value'] for c in cookies}

async def process_pdf_queue(cookies):
    """
    Processes all unique URLs found in the PDF queue file.
    """
    if not os.path.exists(PDF_QUEUE_FILE):
        return

    print("--- Checking PDF queue for new links ---")
    with open(PDF_QUEUE_FILE, "r") as f:
        urls_to_process = set(line.strip() for line in f if line.strip())

    if not urls_to_process:
        print("PDF queue is empty.")
        return

    print(f"Found {len(urls_to_process)} PDF(s) to process.")
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)

    pdf_scraping_cfg = PDFContentScrapingStrategy(
        extract_images=True,
        save_images_locally=True,
        image_save_dir=IMAGE_OUTPUT_DIR,
    )
    pdf_run_cfg = CrawlerRunConfig(scraping_strategy=pdf_scraping_cfg)

    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        for url in urls_to_process:
            print(f"\nProcessing PDF: {url}")
            try:
                result = await crawler.arun(url=url, config=pdf_run_cfg, cookies=cookies)
                if not result.success:
                    print(f"Failed to process PDF {result.url}. Error: {result.error_message}")
                    continue

                content = result.markdown.raw_markdown if result.markdown else ""
                print(f"PAGE CONTENT: {content}")
                # page_id = database.add_crawled_page(result.url, content, 'pdf')

                # if page_id and result.media and result.media.get("images"):
                #     print(f"Found {len(result.media['images'])} images in {result.url}")
                #     for img_info in result.media["images"]:
                #         database.add_crawled_image(
                #             page_id=page_id,
                #             page_number=img_info.get('page'),
                #             local_path=img_info.get('path'),
                #             img_format=img_info.get('format')
                #         )
                print(f"Successfully processed and stored PDF: {result.url}")

            except Exception as e:
                print(f"A critical error occurred while processing PDF '{url}': {e}")

    with open(PDF_QUEUE_FILE, "w") as f:
        f.write("")
    print("\n--- PDF queue processing finished ---")

async def main():
    """Main entry point that runs the PDF processing loop."""
    # database.setup_database()
    print("PDF Processor service starting...")
    cookies = load_cookies()
    while True:
        await process_pdf_queue(cookies)
        print(f"Queue check finished. Waiting {CHECK_INTERVAL_SECONDS}s for next check.")
        await asyncio.sleep(CHECK_INTERVAL_SECONDS)

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nPDF Processor service stopped by user.")
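The processor above drains `pdf_queue.txt`, but the producer side is not in this diff; a minimal sketch of how the crawler could enqueue PDF links it encounters, using the same `result.links` structure the handoff script iterates over (the `.pdf` suffix check and the append-only protocol are assumptions):

PDF_QUEUE_FILE = "pdf_queue.txt"

def enqueue_pdf_links(result) -> int:
    """Append any PDF-looking links from a crawl result to the queue file."""
    links = result.links.get("internal", []) + result.links.get("external", [])
    pdf_urls = {l["href"] for l in links if l.get("href", "").lower().split("?")[0].endswith(".pdf")}
    if not pdf_urls:
        return 0
    with open(PDF_QUEUE_FILE, "a", encoding="utf-8") as f:
        for url in sorted(pdf_urls):
            f.write(url + "\n")
    return len(pdf_urls)

Duplicate appends are harmless here, since process_pdf_queue() already de-duplicates the queue with a set before crawling.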
@ -1,29 +1,110 @@
+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+aiosqlite==0.21.0
+alphashape==1.3.1
+annotated-types==0.7.0
+anyio==4.10.0
+attrs==25.3.0
beautifulsoup4==4.13.4
+Brotli==1.1.0
cachetools==5.5.2
-certifi==2025.7.14
-charset-normalizer==3.4.2
-dotenv==0.9.9
+certifi==2025.8.3
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.4.3
+click==8.2.1
+click-log==0.4.0
+Crawl4AI==0.7.4
+cryptography==45.0.6
+distro==1.9.0
+fake-http-header==0.3.5
+fake-useragent==2.2.0
+filelock==3.19.1
+frozenlist==1.7.0
+fsspec==2025.7.0
google==3.0.0
-google-ai-generativelanguage==0.1.0
+google-ai-generativelanguage==0.6.15
google-api-core==2.25.1
-google-api-python-client==2.177.0
+google-api-python-client==2.179.0
google-auth==2.40.3
google-auth-httplib2==0.2.0
-google-generativeai==0.1.0rc1
+google-generativeai==0.8.5
googleapis-common-protos==1.70.0
-grpcio==1.70.0
-grpcio-status==1.62.3
+greenlet==3.2.4
+grpcio==1.74.0
+grpcio-status==1.71.2
+h11==0.16.0
+h2==4.2.0
+hf-xet==1.1.8
+hpack==4.1.0
+httpcore==1.0.9
httplib2==0.22.0
+httpx==0.28.1
+huggingface-hub==0.34.4
+humanize==4.12.3
+hyperframe==6.1.0
idna==3.10
+importlib_metadata==8.7.0
+Jinja2==3.1.6
+jiter==0.10.0
+joblib==1.5.1
+jsonschema==4.25.1
+jsonschema-specifications==2025.4.1
+lark==1.2.2
+litellm==1.75.9
+lxml==5.4.0
+markdown-it-py==4.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+multidict==6.6.4
+networkx==3.5
+nltk==3.9.1
+numpy==2.3.2
+openai==1.100.2
+packaging==25.0
+patchright==1.52.5
+pillow==11.3.0
+playwright==1.54.0
+propcache==0.3.2
proto-plus==1.26.1
-protobuf==4.25.8
+protobuf==5.29.5
+psutil==7.0.0
pyasn1==0.6.1
-pyasn1-modules==0.4.2
-pyparsing==3.1.4
-python-dotenv==1.0.1
-requests==2.32.4
+pyasn1_modules==0.4.2
+pycparser==2.22
+pydantic==2.11.7
+pydantic_core==2.33.2
+pyee==13.0.0
+Pygments==2.19.2
+pyOpenSSL==25.1.0
+pyparsing==3.2.3
+PyPDF2==3.0.1
+python-dotenv==1.1.1
+PyYAML==6.0.2
+rank-bm25==0.2.2
+referencing==0.36.2
+regex==2025.7.34
+requests==2.32.5
+rich==14.1.0
+rpds-py==0.27.0
rsa==4.9.1
+rtree==1.4.1
+scipy==1.16.1
+shapely==2.1.1
+sniffio==1.3.1
+snowballstemmer==2.2.0
soupsieve==2.7
-typing-extensions==4.13.2
-uritemplate==4.1.1
-urllib3==2.2.3
+tf-playwright-stealth==1.2.0
+tiktoken==0.11.0
+tokenizers==0.21.4
+tqdm==4.67.1
+trimesh==4.7.4
+typing-inspection==0.4.1
+typing_extensions==4.14.1
+uritemplate==4.2.0
+urllib3==2.5.0
+xxhash==3.5.0
+yarl==1.20.1
+zipp==3.23.0