scraper handoff + updated reqs

This commit is contained in:
coleWesterveld 2025-09-03 19:31:14 -04:00
parent 725f028d69
commit b1f3115999
8 changed files with 443 additions and 139 deletions

View File

@@ -1,2 +1,4 @@
.env
marketline_cookies.json
venv/
marketline_session/

View File

@@ -240,8 +240,8 @@ def main():
# basic required-field check (we want the API-required fields present)
if not is_valid_transaction(tx):
print(" ⚠️ Skipping — missing required API fields in extracted transaction:", tx)
continue
print(" ⚠️ missing required API fields in extracted transaction:", tx)
#continue
# Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
# Save the item
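For context, `is_valid_transaction` is not shown in this hunk; a minimal sketch of the kind of required-field check it presumably performs is below. The field names in `REQUIRED_FIELDS` are assumptions, not the project's actual schema.

```python
# Hypothetical sketch of a required-field validator; the real is_valid_transaction
# and the exact field names are not shown in this diff.
REQUIRED_FIELDS = ("date", "buyer", "seller", "amount")  # assumed field names

def is_valid_transaction(tx: dict) -> bool:
    """Return True only if every API-required field is present and non-empty."""
    return all(tx.get(field) not in (None, "") for field in REQUIRED_FIELDS)
```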

View File

@@ -1 +1 @@
*.json

View File

@@ -1,186 +1,138 @@
# NOT USED CURRENTLY
# Temporarily (possibly permanently) switched to the handoff script to get around CAPTCHAs and similar blockers.
# The handoff flow is very similar anyway.
import asyncio
from playwright.async_api import async_playwright, Page
from playwright.async_api import async_playwright
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import ContentScrapingStrategy, ScrapingResult, LXMLWebScrapingStrategy
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
from crawl4ai.deep_crawling import DFSDeepCrawlStrategy, BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.deep_crawling.filters import URLPatternFilter
from datetime import datetime
import logging
import time
# --- CONFIGURATION ---
# TODO: this will need to change for different organizations (i.e. universities)
# Set this to the university login link used to access Marketline.
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Bs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"
# Shouldn't need to change. This is the page we wait for after login, which triggers saving cookies.
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Fs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fVLBbsIwDP2VKvc2aaBAI4rE4DAktiFgO%2BwypcGFSGnSxenY%2Fn4FNo1dOFp%2Bfs%2Fv2WOUtWnEtA0Hu4b3FjBEn7WxKM6NgrTeCidRo7CyBhRBic30YSl4wkTjXXDKGRJNEcEH7ezMWWxr8BvwH1rB83pZkEMIDQpKZSeSGF0mrdu3YJpDoiTt6v0RG7o56LJ0BjoIoqMnDU5XT5stiebdUtrKE%2F0fmXF7bZNaK%2B%2FQVcFZoy0kytW0hAGXKS9jruQu7ucyjWVWyXiUVf2ql2YDOdzRkztOosW8IG9pztRI5izLh9BPcz5i%2BY4N0q6oFAz7vQ6G2MLCYpA2FIQznsUsj1lvy5hgPcGGryRa%2FYRxp%2B1O2%2F3t5MoLCMX9druKLzZfwOPZYgcgk%2FFpQ3EW9lcXuU0rf89AJjdDx2ZMr%2FgvYo147AgX85UzWn1FU2PcceZBBihISujkMvL%2FVybf&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Dsa3ysaynan5loqzpdleq8f1v4ji31utw%26csrfmiddlewaretoken%3DVnbRmbY0l1tnxKqHdJnkZ1yYGlJgPueoiNOMivmejDxCbeVl3A0iV5FdEFmO3DgG%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue"
HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"
# the root page to seed crawling
CRAWLPAGE_URL = "https://advantage.marketline.com/Search?industry=2800001"
# trying out another page
# CRAWLPAGE_URL = "https://www.defensenews.com/"
# name of file where cookies are saved
CRAWLPAGE_URL = "https://advantage.marketline.com/News/NewsListing?q[]=aerospace+and+defense&IsSearchApi=true"
COOKIES_FILE = "marketline_cookies.json"
# --- CRAWLER SETTINGS ---
DEPTH = 3
COUNT = 100
# TODO: maybe make this list more comprehensive?
DEPTH = 2 # A depth of 2 is enough: Page 1 (List) -> Page 2 (Articles)
COUNT = 50
# UPDATED: Expanded keywords to better score article pages
SCRAPER_KEYWORDS = [
# Core Terms
"arms export", "arms sale", "arms trade", "weapons export", "weapons deal",
"military export", "defence contract", "defense contract",
# Canadian Context
"canadian armed forces", "global affairs canada", "canadian defence",
"canadian military", "royal canadian navy", "royal canadian air force",
# Equipment & Technology
"armoured vehicle", "light armoured vehicle", "lav", "naval ship", "warship",
"frigate", "fighter jet", "military aircraft", "surveillance", "radar",
"artillery", "munitions", "firearms", "aerospace",
# Action & Policy Terms
"procurement", "acquisition", "military aid", "export permit", "itar"
"arms", "weapons", "military", "defence", "defense", "aerospace",
"canadian armed forces", "caf", "dnd", "global affairs canada",
"export", "sale", "contract", "procurement", "acquisition",
"armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
"aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
"general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
]
# runs login process and saves cookies so that we can run the scraping with authentication
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
async def login_and_save_cookies():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False)
context = await browser.new_context()
page = await context.new_page()
try:
logging.info("Starting login process... Please complete login in the browser.")
await page.goto(LOGIN_URL)
await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
print("Login detected. Saving session cookies...")
await asyncio.sleep(45)  # give post-login redirects time to settle (a blocking time.sleep would stall the event loop)
# await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
logging.info("Login successful. Saving session cookies...")
cookies = await context.cookies()
with open(COOKIES_FILE, "w") as f:
json.dump(cookies, f)
print("Cookies saved successfully!")
await crawl_with_saved_cookies()
json.dump(cookies, f, indent=2)
logging.info(f"Cookies saved to '{COOKIES_FILE}'.")
except Exception as e:
print(f"Login failed: {e}")
print("Error details:")
print(await page.content())
logging.error(f"Login failed: {e}")
finally:
await context.close()
await browser.close()
def save_results_to_json(successful_data, failed_pages):
"""
Saves the successful and failed crawl results into separate JSON files
in a dedicated directory.
"""
output_dir = "crawl_results"
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"crawl_results_{timestamp}"
os.makedirs(output_dir, exist_ok=True)
print(f"\n💾 Saving results to '{output_dir}' directory...")
logging.info(f"Saving results to '{output_dir}' directory...")
# Define file paths
successful_file = os.path.join(output_dir, "successful_pages.json")
failed_file = os.path.join(output_dir, "failed_pages.json")
# Save successfully scraped data
with open(successful_file, "w", encoding="utf-8") as f:
json.dump(successful_data, f, indent=4, ensure_ascii=False)
print(f" Saved data for {len(successful_data)} successful pages to '{successful_file}'")
logging.info(f"Saved {len(successful_data)} pages to '{successful_file}'")
# Save failed pages if any
if failed_pages:
failed_file = os.path.join(output_dir, "failed_pages.json")
with open(failed_file, "w", encoding="utf-8") as f:
json.dump(failed_pages, f, indent=4, ensure_ascii=False)
print(f" Saved info for {len(failed_pages)} failed pages to '{failed_file}'")
logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")
# runs the crawler with the cookies collected during login
async def crawl_with_saved_cookies():
if not os.path.exists(COOKIES_FILE):
print("No cookies found. Please run login first.")
return
logging.warning("No cookies found. Running login first...")
await login_and_save_cookies()
if not os.path.exists(COOKIES_FILE):
logging.error("Login failed or was aborted. Exiting.")
return
with open(COOKIES_FILE, "r") as f:
cookies = json.load(f)
try:
cookies = json.load(f)
except json.JSONDecodeError:
logging.error(f"Error reading cookies file. Please delete '{COOKIES_FILE}' and run again.")
return
browser_config = BrowserConfig(cookies=cookies)
logging.info(f"Loaded {len(cookies)} cookies for crawling.")
browser_config = BrowserConfig(cookies=cookies, headless=False)
# NEW: Define a filter to only follow links that are news articles
article_filter = URLPatternFilter(patterns=[r"/news/"])
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=DEPTH,
max_pages=COUNT,
url_scorer=KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS,),
#url_scorer=(),
# url_filters=[article_filter] # UPDATED: Add the filter to the strategy
),
scraping_strategy=LXMLWebScrapingStrategy(),
# TODO: scrape the PDFs better
# scraping_strategy=PDFCrawlerStrategy(),
verbose=True,
stream=True,
page_timeout=30000
# scraping_strategy=
# LXMLWebScrapingStrategy(),
verbose=True, stream=True, page_timeout=120000,
wait_until="domcontentloaded"
)
successful_data = []
failed_pages = []
logging.info("Starting crawl...")
async with AsyncWebCrawler(config=browser_config) as crawler:
# We start with the list page. The filter will ensure we only crawl article links from it.
async for result in await crawler.arun(CRAWLPAGE_URL, config=config):
if result.success:
depth = result.metadata.get("depth", 0)
print("RESIULT:", result)
score = result.metadata.get("score", 0)
# here we could look at a few things, the HTML, markdown, raw text, etc.
scraped_content = result.markdown
print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}")
# NEW: Print a preview of the content to confirm it's being scraped
print(f" 📄 Content length: {len(scraped_content)}. Preview: {scraped_content[:120]}...")
print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
successful_data.append({
"url": result.url,
"content": scraped_content,
"depth": depth,
"score": round(score, 2)
"url": result.url, "content": result.markdown,
"depth": result.metadata.get("depth", 0), "score": round(score, 2),
"timestamp": datetime.now().isoformat()
})
else:
failed_pages.append({
'url': result.url,
'error': result.error_message,
'depth': result.metadata.get("depth", 0)
})
print(f"❌ Failed: {result.url} - {result.error_message}")
print(f"📊 Results: {len(successful_data)} successful, {len(failed_pages)} failed")
failed_pages.append({'url': result.url, 'error': result.error_message})
logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
save_results_to_json(successful_data, failed_pages)
# Analyze failures by depth
if failed_pages:
failure_by_depth = {}
for failure in failed_pages:
depth = failure['depth']
failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1
print("❌ Failures by depth:")
for depth, count in sorted(failure_by_depth.items()):
print(f" Depth {depth}: {count} failures")
if __name__ == "__main__":
# Choose which function to run
# 1. First, run the login function once to get your cookies
# asyncio.run(login_and_save_cookies())
# 2. Then, comment out the login line and run the crawl
asyncio.run(crawl_with_saved_cookies())
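If the saved cookies have expired, this cookie-reuse flow will quietly hit login walls, so a quick pre-flight check of `marketline_cookies.json` can save a wasted crawl. A minimal sketch (not part of this commit), assuming Playwright's cookie schema where `expires` is a Unix timestamp and `-1` marks a session cookie:

```python
# Sketch: warn about expired cookies before crawling. Assumes Playwright's
# cookie schema ("name", "expires" as a Unix timestamp, -1 for session cookies).
import json
import time

def check_cookie_freshness(path: str = "marketline_cookies.json") -> bool:
    with open(path) as f:
        cookies = json.load(f)
    now = time.time()
    expired = [c["name"] for c in cookies
               if c.get("expires", -1) not in (-1, None) and c["expires"] < now]
    if expired:
        print(f"⚠️ {len(expired)} cookie(s) already expired: {expired}")
        return False
    return True
```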

View File

@@ -0,0 +1,176 @@
# Like the crawler, but with a session handoff instead of a cookie-sharing approach.
# Opens a non-headless browser; the user logs in and solves any CAPTCHAs, then hands the session off to the scraper.
# More reliable, easier to debug, and CAPTCHA-resistant.
import asyncio
from itertools import chain
from playwright.async_api import async_playwright
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import URLPatternFilter
from datetime import datetime
import logging
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
# --- CONFIGURATION ---
# MODIFIED: Only the login URL is needed for the initial navigation.
# The user will navigate to the crawl starting page manually.
LOGIN_URL = "https://guides.lib.uoguelph.ca/az/databases?q=marketline"
# --- CRAWLER SETTINGS ---
DEPTH = 2
COUNT = 50
SCRAPER_KEYWORDS = [
"arms", "weapons", "military", "defence", "defense", "aerospace",
"canadian armed forces", "caf", "dnd", "global affairs canada",
"export", "sale", "contract", "procurement", "acquisition",
"armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
"aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
"general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
]
# class DebugFilter(BaseFilter):
# def apply(self, urls):
# print("\n=== LINKS BEFORE FILTERING ===")
# for u in urls:
# print(u)
# return urls  # don't drop anything
include_words = URLPatternFilter(patterns=["*News*", "*news*"])
deny_words = URLPatternFilter(patterns=["*Analysis*", "*Sectors*", "*Commentsandopinions*", "*Dashboard*", "*Homepage*"], reverse=True)
# --- SETUP LOGGING ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def save_results_to_json(successful_data, failed_pages):
"""Saves the crawl results to timestamped JSON files in a new directory."""
# timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# We could timestamp this, but we don't because a single file is easier to analyze.
# Later we will probably keep one folder of timestamped files and filter them with a regex.
# For now we just overwrite.
output_dir = "crawl_results"
os.makedirs(output_dir, exist_ok=True)
logging.info(f"Saving results to '{output_dir}' directory...")
successful_file = os.path.join(output_dir, "successful_pages.json")
with open(successful_file, "w", encoding="utf-8") as f:
json.dump(successful_data, f, indent=4, ensure_ascii=False)
logging.info(f"Saved {len(successful_data)} successful pages to '{successful_file}'")
if failed_pages:
failed_file = os.path.join(output_dir, "failed_pages.json")
with open(failed_file, "w", encoding="utf-8") as f:
json.dump(failed_pages, f, indent=4, ensure_ascii=False)
logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")
async def main():
"""
Main function to handle manual login, capture the session state from the
active tab, and then hand it off to the crawler.
"""
# --- STEP 1: Manual Login in a Temporary Browser ---
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False)
context = await browser.new_context()
page = await context.new_page() # This is the initial page
logging.info("A browser window has opened. Please complete the following steps:")
logging.info(f"1. Log in and navigate to the exact page where you want the crawl to begin.")
logging.info("2. Solve any CAPTCHAs or 2FA prompts.")
await page.goto(LOGIN_URL)
input("\n>>> Press Enter in this console window once you are logged in and on the starting page... <<<\n")
# MODIFIED: Instead of using the original 'page' object, get the current active tab.
# This correctly handles cases where the login process opens a new tab.
print("ALL PAGES:")
for page in context.pages:
print("URL: ", page.url)
active_page = context.pages[-1]
start_url = "https://advantage.marketline.com/News/NewsListing?q%5B%5D=aerospace+and+defence&IsSearchApi=true&exactword=1"
logging.info(f"Login complete. Using active tab URL to start crawl: {start_url}")
# Capture the full session state (cookies, localStorage, etc.)
storage_state = await context.storage_state()
# We no longer need this temporary browser.
await browser.close()
# --- STEP 2: Configure and Run the Crawler with the Captured State ---
# Pass the captured 'storage_state' dictionary to the crawler's browser configuration.
browser_config = BrowserConfig(
headless=False,
storage_state=storage_state # This injects your logged-in session.
)
scorer = KeywordRelevanceScorer(
keywords=SCRAPER_KEYWORDS,
weight=0.7
)
filter = FilterChain([
# DebugFilter(),
include_words,
deny_words
])
# This configuration remains the same
config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=DEPTH,
max_pages=COUNT,
url_scorer=scorer,
filter_chain=filter
),
verbose=True,
stream=True,
page_timeout=120000,
wait_until="domcontentloaded"
)
successful_data = []
failed_pages = []
logging.info("Starting crawler with the captured session state...")
async with AsyncWebCrawler(config=browser_config) as crawler:
# The crawler will now begin at the correct URL you navigated to.
async for result in await crawler.arun(start_url, config=config):
if result.success:
all_links = [
l["href"]
for l in chain(result.links.get("internal", []), result.links.get("external", []))
]
print(f"✅ Scraped: {result.url}")
print("Filtered links:")
# Apply filters one URL at a time
for url in all_links:
if include_words.apply(url) and deny_words.apply(url):
print(" ->", url)
score = result.metadata.get("score", 0)
print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
successful_data.append({
"url": result.url, "content": result.markdown,
"depth": result.metadata.get("depth", 0), "score": round(score, 2),
"timestamp": datetime.now().isoformat()
})
else:
print(f"❌ Failed: {result.url} - {result.error_message}")
failed_pages.append({'url': result.url, 'error': result.error_message})
logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
save_results_to_json(successful_data, failed_pages)
if __name__ == "__main__":
asyncio.run(main())
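One possible extension (not part of this commit): the captured `storage_state` only lives in memory, so every run requires a fresh manual login. Persisting it to disk and reusing it while it is still valid might look roughly like the sketch below; the `marketline_session/storage_state.json` path is an assumption (the updated `.gitignore` above already ignores `marketline_session/`), and `BrowserConfig(storage_state=dict)` is the same call the handoff script already uses.

```python
# Sketch: persist and reuse the Playwright storage_state between runs.
# STATE_FILE is an assumed name, not something defined in this commit.
import json
import os

STATE_FILE = "marketline_session/storage_state.json"

def save_state(storage_state: dict) -> None:
    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
    with open(STATE_FILE, "w") as f:
        json.dump(storage_state, f, indent=2)

def load_state() -> dict | None:
    if not os.path.exists(STATE_FILE):
        return None
    with open(STATE_FILE) as f:
        return json.load(f)

# Usage inside main(): state = load_state(); if it is None, run the manual
# handoff and call save_state(storage_state) before closing the browser.
```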

View File

@@ -0,0 +1,93 @@
import asyncio
import os
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
# import database
# --- Configuration ---
PDF_QUEUE_FILE = "pdf_queue.txt"
COOKIES_FILE = "marketline_cookies.json"
IMAGE_OUTPUT_DIR = "./extracted_images"
CHECK_INTERVAL_SECONDS = 60
def load_cookies():
"""Loads cookies from the JSON file if it exists."""
if not os.path.exists(COOKIES_FILE):
print("Warning: cookies.json not found. Crawling without authentication.")
return None
with open(COOKIES_FILE, 'r') as f:
cookies = json.load(f)
return {c['name']: c['value'] for c in cookies}
async def process_pdf_queue(cookies):
"""
Processes all unique URLs found in the PDF queue file.
"""
if not os.path.exists(PDF_QUEUE_FILE):
return
print("--- Checking PDF queue for new links ---")
with open(PDF_QUEUE_FILE, "r") as f:
urls_to_process = set(line.strip() for line in f if line.strip())
if not urls_to_process:
print("PDF queue is empty.")
return
print(f"Found {len(urls_to_process)} PDF(s) to process.")
os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
pdf_scraping_cfg = PDFContentScrapingStrategy(
extract_images=True,
save_images_locally=True,
image_save_dir=IMAGE_OUTPUT_DIR,
)
pdf_run_cfg = CrawlerRunConfig(scraping_strategy=pdf_scraping_cfg)
async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
for url in urls_to_process:
print(f"\nProcessing PDF: {url}")
try:
result = await crawler.arun(url=url, config=pdf_run_cfg, cookies=cookies)
if not result.success:
print(f"Failed to process PDF {result.url}. Error: {result.error_message}")
continue
content = result.markdown.raw_markdown if result.markdown else ""
print(f"PAGE CONTENT: {content}")
# page_id = database.add_crawled_page(result.url, content, 'pdf')
# if page_id and result.media and result.media.get("images"):
# print(f"Found {len(result.media['images'])} images in {result.url}")
# for img_info in result.media["images"]:
# database.add_crawled_image(
# page_id=page_id,
# page_number=img_info.get('page'),
# local_path=img_info.get('path'),
# img_format=img_info.get('format')
# )
print(f"Successfully processed and stored PDF: {result.url}")
except Exception as e:
print(f"A critical error occurred while processing PDF '{url}': {e}")
with open(PDF_QUEUE_FILE, "w") as f:
f.write("")
print("\n--- PDF queue processing finished ---")
async def main():
"""Main entry point that runs the PDF processing loop."""
# database.setup_database()
print("PDF Processor service starting...")
cookies = load_cookies()
while True:
await process_pdf_queue(cookies)
print(f"Queue check finished. Waiting {CHECK_INTERVAL_SECONDS}s for next check.")
await asyncio.sleep(CHECK_INTERVAL_SECONDS)
if __name__ == "__main__":
try:
asyncio.run(main())
except KeyboardInterrupt:
print("\nPDF Processor service stopped by user.")

View File

View File

@@ -1,29 +1,110 @@
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.12.15
aiosignal==1.4.0
aiosqlite==0.21.0
alphashape==1.3.1
annotated-types==0.7.0
anyio==4.10.0
attrs==25.3.0
beautifulsoup4==4.13.4
Brotli==1.1.0
cachetools==5.5.2
certifi==2025.7.14
charset-normalizer==3.4.2
dotenv==0.9.9
certifi==2025.8.3
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.3
click==8.2.1
click-log==0.4.0
Crawl4AI==0.7.4
cryptography==45.0.6
distro==1.9.0
fake-http-header==0.3.5
fake-useragent==2.2.0
filelock==3.19.1
frozenlist==1.7.0
fsspec==2025.7.0
google==3.0.0
google-ai-generativelanguage==0.1.0
google-ai-generativelanguage==0.6.15
google-api-core==2.25.1
google-api-python-client==2.177.0
google-api-python-client==2.179.0
google-auth==2.40.3
google-auth-httplib2==0.2.0
google-generativeai==0.1.0rc1
google-generativeai==0.8.5
googleapis-common-protos==1.70.0
grpcio==1.70.0
grpcio-status==1.62.3
greenlet==3.2.4
grpcio==1.74.0
grpcio-status==1.71.2
h11==0.16.0
h2==4.2.0
hf-xet==1.1.8
hpack==4.1.0
httpcore==1.0.9
httplib2==0.22.0
httpx==0.28.1
huggingface-hub==0.34.4
humanize==4.12.3
hyperframe==6.1.0
idna==3.10
importlib_metadata==8.7.0
Jinja2==3.1.6
jiter==0.10.0
joblib==1.5.1
jsonschema==4.25.1
jsonschema-specifications==2025.4.1
lark==1.2.2
litellm==1.75.9
lxml==5.4.0
markdown-it-py==4.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
multidict==6.6.4
networkx==3.5
nltk==3.9.1
numpy==2.3.2
openai==1.100.2
packaging==25.0
patchright==1.52.5
pillow==11.3.0
playwright==1.54.0
propcache==0.3.2
proto-plus==1.26.1
protobuf==4.25.8
protobuf==5.29.5
psutil==7.0.0
pyasn1==0.6.1
pyasn1-modules==0.4.2
pyparsing==3.1.4
python-dotenv==1.0.1
requests==2.32.4
pyasn1_modules==0.4.2
pycparser==2.22
pydantic==2.11.7
pydantic_core==2.33.2
pyee==13.0.0
Pygments==2.19.2
pyOpenSSL==25.1.0
pyparsing==3.2.3
PyPDF2==3.0.1
python-dotenv==1.1.1
PyYAML==6.0.2
rank-bm25==0.2.2
referencing==0.36.2
regex==2025.7.34
requests==2.32.5
rich==14.1.0
rpds-py==0.27.0
rsa==4.9.1
rtree==1.4.1
scipy==1.16.1
shapely==2.1.1
sniffio==1.3.1
snowballstemmer==2.2.0
soupsieve==2.7
typing-extensions==4.13.2
uritemplate==4.1.1
urllib3==2.2.3
tf-playwright-stealth==1.2.0
tiktoken==0.11.0
tokenizers==0.21.4
tqdm==4.67.1
trimesh==4.7.4
typing-inspection==0.4.1
typing_extensions==4.14.1
uritemplate==4.2.0
urllib3==2.5.0
xxhash==3.5.0
yarl==1.20.1
zipp==3.23.0