diff --git a/docker/crawler/.gitignore b/docker/crawler/.gitignore
index 0407f89..8c12deb 100644
--- a/docker/crawler/.gitignore
+++ b/docker/crawler/.gitignore
@@ -1,2 +1,4 @@
 .env
-marketline_cookies.json
\ No newline at end of file
+marketline_cookies.json
+venv/
+marketline_session/
\ No newline at end of file
diff --git a/docker/crawler/analyze.py b/docker/crawler/analyze.py
index 9338753..63b11b0 100644
--- a/docker/crawler/analyze.py
+++ b/docker/crawler/analyze.py
@@ -240,8 +240,8 @@ def main():
         # basic required-field check (we want the API-required fields present)
         if not is_valid_transaction(tx):
-            print(" ⚠️ Skipping — missing required API fields in extracted transaction:", tx)
-            continue
+            print(" ⚠️ missing required API fields in extracted transaction:", tx)
+            #continue
         # Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
         # Save the item
diff --git a/docker/crawler/crawl_results/.gitignore b/docker/crawler/crawl_results/.gitignore
index 94a2dd1..a6c57f5 100644
--- a/docker/crawler/crawl_results/.gitignore
+++ b/docker/crawler/crawl_results/.gitignore
@@ -1 +1 @@
-*.json
\ No newline at end of file
+*.json
diff --git a/docker/crawler/marketline_crawler.py b/docker/crawler/marketline_crawler.py
index 3dcf5e1..3bd215a 100644
--- a/docker/crawler/marketline_crawler.py
+++ b/docker/crawler/marketline_crawler.py
@@ -1,186 +1,138 @@
+# NOT USED CURRENTLY
+# temporarily, if not permanently, switched to using the handoff file to get around CAPTCHAs
+# the handoff flow is very similar anyway
+
 import asyncio
-from playwright.async_api import async_playwright, Page
+from playwright.async_api import async_playwright
 import json
 import os
 from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
-from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
-from crawl4ai.content_scraping_strategy import ContentScrapingStrategy, ScrapingResult, LXMLWebScrapingStrategy
-from crawl4ai.processors.pdf import PDFContentScrapingStrategy
+from crawl4ai.deep_crawling import DFSDeepCrawlStrategy, BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
 from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
 from crawl4ai.deep_crawling.filters import URLPatternFilter
 from datetime import datetime
+import logging
+import time

 # --- CONFIGURATION ---
-
-# TODO: this will need to change for different organizations (ie univiersities)
-
-# make this the link for university login when accessing marketline
-LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Bs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"
-
-# shouldnt need to change. this is what we will wait for to load after logging in to trigger saving cookies.
+LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Fs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"
+LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fVLBbsIwDP2VKvc2aaBAI4rE4DAktiFgO%2BwypcGFSGnSxenY%2Fn4FNo1dOFp%2Bfs%2Fv2WOUtWnEtA0Hu4b3FjBEn7WxKM6NgrTeCidRo7CyBhRBic30YSl4wkTjXXDKGRJNEcEH7ezMWWxr8BvwH1rB83pZkEMIDQpKZSeSGF0mrdu3YJpDoiTt6v0RG7o56LJ0BjoIoqMnDU5XT5stiebdUtrKE%2F0fmXF7bZNaK%2B%2FQVcFZoy0kytW0hAGXKS9jruQu7ucyjWVWyXiUVf2ql2YDOdzRkztOosW8IG9pztRI5izLh9BPcz5i%2BY4N0q6oFAz7vQ6G2MLCYpA2FIQznsUsj1lvy5hgPcGGryRa%2FYRxp%2B1O2%2F3t5MoLCMX9druKLzZfwOPZYgcgk%2FFpQ3EW9lcXuU0rf89AJjdDx2ZMr%2FgvYo147AgX85UzWn1FU2PcceZBBihISujkMvL%2FVybf&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Dsa3ysaynan5loqzpdleq8f1v4ji31utw%26csrfmiddlewaretoken%3DVnbRmbY0l1tnxKqHdJnkZ1yYGlJgPueoiNOMivmejDxCbeVl3A0iV5FdEFmO3DgG%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue"
 HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"
-
-# the root page to seed crawling
-CRAWLPAGE_URL = "https://advantage.marketline.com/Search?industry=2800001"
-# trying out another page
-# CRAWLPAGE_URL = "https://www.defensenews.com/"
-
-
-# name of file where cookies are saved
+CRAWLPAGE_URL = "https://advantage.marketline.com/News/NewsListing?q[]=aerospace+and+defense&IsSearchApi=true"
 COOKIES_FILE = "marketline_cookies.json"

 # --- CRAWLER SETTINGS ---
-DEPTH = 3
-COUNT = 100
-
-# TODO: maybe make this list more comprehensive?
+DEPTH = 2  # A depth of 2 is enough: Page 1 (List) -> Page 2 (Articles)
+COUNT = 50
+# UPDATED: Expanded keywords to better score article pages
 SCRAPER_KEYWORDS = [
-    # Core Terms
-    "arms export", "arms sale", "arms trade", "weapons export", "weapons deal",
-    "military export", "defence contract", "defense contract",
-
-    # Canadian Context
-    "canadian armed forces", "global affairs canada", "canadian defence",
-    "canadian military", "royal canadian navy", "royal canadian air force",
-
-    # Equipment & Technology
-    "armoured vehicle", "light armoured vehicle", "lav", "naval ship", "warship",
-    "frigate", "fighter jet", "military aircraft", "surveillance", "radar",
-    "artillery", "munitions", "firearms", "aerospace",
-
-    # Action & Policy Terms
-    "procurement", "acquisition", "military aid", "export permit", "itar"
+    "arms", "weapons", "military", "defence", "defense", "aerospace",
+    "canadian armed forces", "caf", "dnd", "global affairs canada",
+    "export", "sale", "contract", "procurement", "acquisition",
+    "armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
+    "aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
+    "general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
 ]

-# runs login process and saves cookies so that we can run the scraping with authentication
-async def login_and_save_cookies():
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+async def login_and_save_cookies():
     async with async_playwright() as p:
         browser = await p.chromium.launch(headless=False)
         context = await browser.new_context()
         page = await context.new_page()
-
         try:
+            logging.info("Starting login process... Please complete login in the browser.")
             await page.goto(LOGIN_URL)
-            await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
-
-            print("Login detected. Saving session cookies...")
+            time.sleep(45)
+            # await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
+            logging.info("Login successful. Saving session cookies...")
             cookies = await context.cookies()
             with open(COOKIES_FILE, "w") as f:
-                json.dump(cookies, f)
-
-            print("Cookies saved successfully!")
-            await crawl_with_saved_cookies()
-
+                json.dump(cookies, f, indent=2)
+            logging.info(f"Cookies saved to '{COOKIES_FILE}'.")
         except Exception as e:
-            print(f"Login failed: {e}")
-            print("Error details:")
-            print(await page.content())
-
+            logging.error(f"Login failed: {e}")
         finally:
             await context.close()
             await browser.close()

 def save_results_to_json(successful_data, failed_pages):
-    """
-    Saves the successful and failed crawl results into separate JSON files
-    in a dedicated directory.
- """ - output_dir = "crawl_results" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = f"crawl_results_{timestamp}" os.makedirs(output_dir, exist_ok=True) - print(f"\n💾 Saving results to '{output_dir}' directory...") + logging.info(f"Saving results to '{output_dir}' directory...") - # Define file paths successful_file = os.path.join(output_dir, "successful_pages.json") - failed_file = os.path.join(output_dir, "failed_pages.json") - - # Save successfully scraped data with open(successful_file, "w", encoding="utf-8") as f: json.dump(successful_data, f, indent=4, ensure_ascii=False) - print(f" Saved data for {len(successful_data)} successful pages to '{successful_file}'") + logging.info(f"Saved {len(successful_data)} pages to '{successful_file}'") - # Save failed pages if any if failed_pages: + failed_file = os.path.join(output_dir, "failed_pages.json") with open(failed_file, "w", encoding="utf-8") as f: json.dump(failed_pages, f, indent=4, ensure_ascii=False) - print(f" Saved info for {len(failed_pages)} failed pages to '{failed_file}'") + logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'") - -# runs the crawler with the cookies collected during login async def crawl_with_saved_cookies(): - if not os.path.exists(COOKIES_FILE): - print("No cookies found. Please run login first.") - return + logging.warning("No cookies found. Running login first...") + await login_and_save_cookies() + if not os.path.exists(COOKIES_FILE): + logging.error("Login failed or was aborted. Exiting.") + return with open(COOKIES_FILE, "r") as f: - cookies = json.load(f) + try: + cookies = json.load(f) + except json.JSONDecodeError: + logging.error(f"Error reading cookies file. Please delete '{COOKIES_FILE}' and run again.") + return - browser_config = BrowserConfig(cookies=cookies) + logging.info(f"Loaded {len(cookies)} cookies for crawling.") + browser_config = BrowserConfig(cookies=cookies, headless=False) + + # NEW: Define a filter to only follow links that are news articles + article_filter = URLPatternFilter(patterns=[r"/news/"]) config = CrawlerRunConfig( - deep_crawl_strategy=BFSDeepCrawlStrategy( + + + deep_crawl_strategy=BestFirstCrawlingStrategy( max_depth=DEPTH, max_pages=COUNT, - url_scorer=KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS,), + #url_scorer=(), + # url_filters=[article_filter] # UPDATED: Add the filter to the strategy ), - scraping_strategy=LXMLWebScrapingStrategy(), - # TODO: scrape the PDFs better - # scraping_strategy=PDFCrawlerStrategy(), - verbose=True, - stream=True, - page_timeout=30000 + # scraping_strategy= + # LXMLWebScrapingStrategy(), + verbose=True, stream=True, page_timeout=120000, + wait_until="domcontentloaded" ) - + successful_data = [] failed_pages = [] - + + logging.info("Starting crawl...") async with AsyncWebCrawler(config=browser_config) as crawler: + # We start with the list page. The filter will ensure we only crawl article links from it. async for result in await crawler.arun(CRAWLPAGE_URL, config=config): if result.success: - depth = result.metadata.get("depth", 0) + print("RESIULT:", result) score = result.metadata.get("score", 0) - - # here we could look at a few things, the HTML, markdown, raw text, etc. - scraped_content = result.markdown - - print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}") - # NEW: Print a preview of the content to confirm it's being scraped - print(f" 📄 Content length: {len(scraped_content)}. 
-
+
                 successful_data.append({
-                    "url": result.url,
-                    "content": scraped_content,
-                    "depth": depth,
-                    "score": round(score, 2)
+                    "url": result.url, "content": result.markdown,
+                    "depth": result.metadata.get("depth", 0), "score": round(score, 2),
+                    "timestamp": datetime.now().isoformat()
                 })
             else:
-                failed_pages.append({
-                    'url': result.url,
-                    'error': result.error_message,
-                    'depth': result.metadata.get("depth", 0)
-                })
                 print(f"❌ Failed: {result.url} - {result.error_message}")
-
-    print(f"📊 Results: {len(successful_data)} successful, {len(failed_pages)} failed")
+                failed_pages.append({'url': result.url, 'error': result.error_message})
+    logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
     save_results_to_json(successful_data, failed_pages)
-
-    # Analyze failures by depth
-    if failed_pages:
-        failure_by_depth = {}
-        for failure in failed_pages:
-            depth = failure['depth']
-            failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1
-
-        print("❌ Failures by depth:")
-        for depth, count in sorted(failure_by_depth.items()):
-            print(f" Depth {depth}: {count} failures")

 if __name__ == "__main__":
-    # Choose which function to run
-    # 1. First, run the login function once to get your cookies
-    # asyncio.run(login_and_save_cookies())
-
-    # 2. Then, comment out the login line and run the crawl
     asyncio.run(crawl_with_saved_cookies())
\ No newline at end of file
diff --git a/docker/crawler/marketline_handoff.py b/docker/crawler/marketline_handoff.py
new file mode 100644
index 0000000..5fa08e3
--- /dev/null
+++ b/docker/crawler/marketline_handoff.py
@@ -0,0 +1,176 @@
+# like the crawler, but with a session handoff instead of a cookie-sharing approach
+# opens a non-headless browser; the user logs in and solves any CAPTCHAs, then hands the session off to the scraper
+
+# more reliable, easier to debug, and CAPTCHA-resistant
+
+import asyncio
+from itertools import chain
+from playwright.async_api import async_playwright
+import json
+import os
+from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.filters import URLPatternFilter
+from datetime import datetime
+import logging
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
+
+# --- CONFIGURATION ---
+# MODIFIED: Only the login URL is needed for the initial navigation.
+# The user will navigate to the crawl starting page manually.
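+# NOTE: this is the library's A-Z database listing filtered to Marketline; it is only an entry point
+# for the manual login and replaces the long Microsoft/OpenAthens SAML URL used in marketline_crawler.py.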
+LOGIN_URL = "https://guides.lib.uoguelph.ca/az/databases?q=marketline" + +# --- CRAWLER SETTINGS --- +DEPTH = 2 +COUNT = 50 +SCRAPER_KEYWORDS = [ + "arms", "weapons", "military", "defence", "defense", "aerospace", + "canadian armed forces", "caf", "dnd", "global affairs canada", + "export", "sale", "contract", "procurement", "acquisition", + "armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet", + "aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery", + "general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam" +] + +# class DebugFilter(BaseFilter): +# def apply(self, urls): +# print("\n=== LINKS BEFORE FILTERING ===") +# for u in urls: +# print(u) +# return urls # don’t drop anything + +include_words = URLPatternFilter(patterns=["*News*", "*news*"]) +deny_words = URLPatternFilter(patterns=["*Analysis*", "*Sectors*", "*Commentsandopinions*", "*Dashboard*", "*Homepage*"], reverse=True) + +# --- SETUP LOGGING --- +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +def save_results_to_json(successful_data, failed_pages): + """Saves the crawl results to timestamped JSON files in a new directory.""" + # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # could timestamp this but im not cuz its easier to analyze. + # later we will prolly have one folder with all the timestamped files that we will go through regex'd + # for now well just overwrite. + output_dir = f"crawl_results" + os.makedirs(output_dir, exist_ok=True) + logging.info(f"Saving results to '{output_dir}' directory...") + + successful_file = os.path.join(output_dir, "successful_pages.json") + with open(successful_file, "w", encoding="utf-8") as f: + json.dump(successful_data, f, indent=4, ensure_ascii=False) + logging.info(f"Saved {len(successful_data)} successful pages to '{successful_file}'") + + if failed_pages: + failed_file = os.path.join(output_dir, "failed_pages.json") + with open(failed_file, "w", encoding="utf-8") as f: + json.dump(failed_pages, f, indent=4, ensure_ascii=False) + logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'") + +async def main(): + """ + Main function to handle manual login, capture the session state from the + active tab, and then hand it off to the crawler. + """ + # --- STEP 1: Manual Login in a Temporary Browser --- + async with async_playwright() as p: + browser = await p.chromium.launch(headless=False) + context = await browser.new_context() + page = await context.new_page() # This is the initial page + + logging.info("A browser window has opened. Please complete the following steps:") + logging.info(f"1. Log in and navigate to the exact page where you want the crawl to begin.") + logging.info("2. Solve any CAPTCHAs or 2FA prompts.") + await page.goto(LOGIN_URL) + + input("\n>>> Press Enter in this console window once you are logged in and on the starting page... <<<\n") + + # MODIFIED: Instead of using the original 'page' object, get the current active tab. + # This correctly handles cases where the login process opens a new tab. + print("ALL PAGES:") + for page in context.pages: + print("URL: ", page.url) + active_page = context.pages[-1] + start_url = "https://advantage.marketline.com/News/NewsListing?q%5B%5D=aerospace+and+defence&IsSearchApi=true&exactword=1" + + logging.info(f"Login complete. Using active tab URL to start crawl: {start_url}") + + # Capture the full session state (cookies, localStorage, etc.) 
+        storage_state = await context.storage_state()
+
+        # We no longer need this temporary browser.
+        await browser.close()
+
+    # --- STEP 2: Configure and Run the Crawler with the Captured State ---
+
+    # Pass the captured 'storage_state' dictionary to the crawler's browser configuration.
+    browser_config = BrowserConfig(
+        headless=False,
+        storage_state=storage_state  # This injects your logged-in session.
+    )
+
+    scorer = KeywordRelevanceScorer(
+        keywords=SCRAPER_KEYWORDS,
+        weight=0.7
+    )
+
+    link_filter = FilterChain([
+        # DebugFilter(),
+        include_words,
+        deny_words
+    ])
+
+    # This configuration remains the same
+    config = CrawlerRunConfig(
+
+        deep_crawl_strategy=BestFirstCrawlingStrategy(
+            max_depth=DEPTH,
+            max_pages=COUNT,
+            url_scorer=scorer,
+            filter_chain=link_filter
+        ),
+        verbose=True,
+        stream=True,
+        page_timeout=120000,
+        wait_until="domcontentloaded"
+    )
+
+    successful_data = []
+    failed_pages = []
+
+    logging.info("Starting crawler with the captured session state...")
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # The crawler will now begin at the correct URL you navigated to.
+        async for result in await crawler.arun(start_url, config=config):
+            if result.success:
+                all_links = [
+                    l["href"]
+                    for l in chain(result.links.get("internal", []), result.links.get("external", []))
+                ]
+
+                print(f"✅ Scraped: {result.url}")
+                print("Filtered links:")
+
+                # Apply filters one URL at a time
+                for url in all_links:
+                    if include_words.apply(url) and deny_words.apply(url):
+                        print("  ->", url)
+                score = result.metadata.get("score", 0)
+                print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
+                successful_data.append({
+                    "url": result.url, "content": result.markdown,
+                    "depth": result.metadata.get("depth", 0), "score": round(score, 2),
+                    "timestamp": datetime.now().isoformat()
+                })
+            else:
+                print(f"❌ Failed: {result.url} - {result.error_message}")
+                failed_pages.append({'url': result.url, 'error': result.error_message})
+
+    logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
+    save_results_to_json(successful_data, failed_pages)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file
diff --git a/docker/crawler/pdf_crawler.py b/docker/crawler/pdf_crawler.py
new file mode 100644
index 0000000..3f7a394
--- /dev/null
+++ b/docker/crawler/pdf_crawler.py
@@ -0,0 +1,93 @@
+import asyncio
+import os
+import json
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
+# import database
+
+# --- Configuration ---
+PDF_QUEUE_FILE = "pdf_queue.txt"
+COOKIES_FILE = "marketline_cookies.json"
+IMAGE_OUTPUT_DIR = "./extracted_images"
+CHECK_INTERVAL_SECONDS = 60
+
+def load_cookies():
+    """Loads cookies from the JSON file if it exists."""
+    if not os.path.exists(COOKIES_FILE):
+        print(f"Warning: {COOKIES_FILE} not found. Crawling without authentication.")
+        return None
+    with open(COOKIES_FILE, 'r') as f:
+        cookies = json.load(f)
+    return {c['name']: c['value'] for c in cookies}
+
+async def process_pdf_queue(cookies):
+    """
+    Processes all unique URLs found in the PDF queue file.
+ """ + if not os.path.exists(PDF_QUEUE_FILE): + return + + print("--- Checking PDF queue for new links ---") + with open(PDF_QUEUE_FILE, "r") as f: + urls_to_process = set(line.strip() for line in f if line.strip()) + + if not urls_to_process: + print("PDF queue is empty.") + return + + print(f"Found {len(urls_to_process)} PDF(s) to process.") + os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True) + + pdf_scraping_cfg = PDFContentScrapingStrategy( + extract_images=True, + save_images_locally=True, + image_save_dir=IMAGE_OUTPUT_DIR, + ) + pdf_run_cfg = CrawlerRunConfig(scraping_strategy=pdf_scraping_cfg) + + async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler: + for url in urls_to_process: + print(f"\nProcessing PDF: {url}") + try: + result = await crawler.arun(url=url, config=pdf_run_cfg, cookies=cookies) + if not result.success: + print(f"Failed to process PDF {result.url}. Error: {result.error_message}") + continue + + content = result.markdown.raw_markdown if result.markdown else "" + print(f"PAGE CONTENT: {content}") + # page_id = database.add_crawled_page(result.url, content, 'pdf') + + # if page_id and result.media and result.media.get("images"): + # print(f"Found {len(result.media['images'])} images in {result.url}") + # for img_info in result.media["images"]: + # database.add_crawled_image( + # page_id=page_id, + # page_number=img_info.get('page'), + # local_path=img_info.get('path'), + # img_format=img_info.get('format') + # ) + print(f"Successfully processed and stored PDF: {result.url}") + + except Exception as e: + print(f"A critical error occurred while processing PDF '{url}': {e}") + + with open(PDF_QUEUE_FILE, "w") as f: + f.write("") + print("\n--- PDF queue processing finished ---") + +async def main(): + """Main entry point that runs the PDF processing loop.""" + # database.setup_database() + print("PDF Processor service starting...") + cookies = load_cookies() + while True: + await process_pdf_queue(cookies) + print(f"Queue check finished. 
+        await asyncio.sleep(CHECK_INTERVAL_SECONDS)
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        print("\nPDF Processor service stopped by user.")
\ No newline at end of file
diff --git a/docker/crawler/pdf_queue.txt b/docker/crawler/pdf_queue.txt
new file mode 100644
index 0000000..e69de29
diff --git a/docker/crawler/requirements.txt b/docker/crawler/requirements.txt
index 9f3862b..40a72af 100644
--- a/docker/crawler/requirements.txt
+++ b/docker/crawler/requirements.txt
@@ -1,29 +1,110 @@
+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+aiosqlite==0.21.0
+alphashape==1.3.1
+annotated-types==0.7.0
+anyio==4.10.0
+attrs==25.3.0
 beautifulsoup4==4.13.4
+Brotli==1.1.0
 cachetools==5.5.2
-certifi==2025.7.14
-charset-normalizer==3.4.2
-dotenv==0.9.9
+certifi==2025.8.3
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.4.3
+click==8.2.1
+click-log==0.4.0
+Crawl4AI==0.7.4
+cryptography==45.0.6
+distro==1.9.0
+fake-http-header==0.3.5
+fake-useragent==2.2.0
+filelock==3.19.1
+frozenlist==1.7.0
+fsspec==2025.7.0
 google==3.0.0
-google-ai-generativelanguage==0.1.0
+google-ai-generativelanguage==0.6.15
 google-api-core==2.25.1
-google-api-python-client==2.177.0
+google-api-python-client==2.179.0
 google-auth==2.40.3
 google-auth-httplib2==0.2.0
-google-generativeai==0.1.0rc1
+google-generativeai==0.8.5
 googleapis-common-protos==1.70.0
-grpcio==1.70.0
-grpcio-status==1.62.3
+greenlet==3.2.4
+grpcio==1.74.0
+grpcio-status==1.71.2
+h11==0.16.0
+h2==4.2.0
+hf-xet==1.1.8
+hpack==4.1.0
+httpcore==1.0.9
 httplib2==0.22.0
+httpx==0.28.1
+huggingface-hub==0.34.4
+humanize==4.12.3
+hyperframe==6.1.0
 idna==3.10
+importlib_metadata==8.7.0
+Jinja2==3.1.6
+jiter==0.10.0
+joblib==1.5.1
+jsonschema==4.25.1
+jsonschema-specifications==2025.4.1
+lark==1.2.2
+litellm==1.75.9
+lxml==5.4.0
+markdown-it-py==4.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+multidict==6.6.4
+networkx==3.5
+nltk==3.9.1
+numpy==2.3.2
+openai==1.100.2
+packaging==25.0
+patchright==1.52.5
+pillow==11.3.0
+playwright==1.54.0
+propcache==0.3.2
 proto-plus==1.26.1
-protobuf==4.25.8
+protobuf==5.29.5
+psutil==7.0.0
 pyasn1==0.6.1
-pyasn1-modules==0.4.2
-pyparsing==3.1.4
-python-dotenv==1.0.1
-requests==2.32.4
+pyasn1_modules==0.4.2
+pycparser==2.22
+pydantic==2.11.7
+pydantic_core==2.33.2
+pyee==13.0.0
+Pygments==2.19.2
+pyOpenSSL==25.1.0
+pyparsing==3.2.3
+PyPDF2==3.0.1
+python-dotenv==1.1.1
+PyYAML==6.0.2
+rank-bm25==0.2.2
+referencing==0.36.2
+regex==2025.7.34
+requests==2.32.5
+rich==14.1.0
+rpds-py==0.27.0
 rsa==4.9.1
+rtree==1.4.1
+scipy==1.16.1
+shapely==2.1.1
+sniffio==1.3.1
+snowballstemmer==2.2.0
 soupsieve==2.7
-typing-extensions==4.13.2
-uritemplate==4.1.1
-urllib3==2.2.3
+tf-playwright-stealth==1.2.0
+tiktoken==0.11.0
+tokenizers==0.21.4
+tqdm==4.67.1
+trimesh==4.7.4
+typing-inspection==0.4.1
+typing_extensions==4.14.1
+uritemplate==4.2.0
+urllib3==2.5.0
+xxhash==3.5.0
+yarl==1.20.1
+zipp==3.23.0
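
Note on wiring: nothing in this patch writes to pdf_queue.txt, so pdf_crawler.py only processes URLs that are added to the queue by hand. Below is a minimal producer sketch; it assumes the crawl output written by save_results_to_json() (crawl_results/successful_pages.json with string "content" fields) and simply appends any .pdf links it finds to the queue. The function name and regex are illustrative and not part of the patch.

# hypothetical queue producer (not part of this patch)
import json
import re

QUEUE_FILE = "pdf_queue.txt"
RESULTS_FILE = "crawl_results/successful_pages.json"

def enqueue_pdf_links():
    """Scan scraped markdown for .pdf links and append them to the PDF queue."""
    with open(RESULTS_FILE, encoding="utf-8") as f:
        pages = json.load(f)

    # naive pattern; assumes absolute .pdf links appear in the saved markdown
    pdf_links = set()
    for page in pages:
        content = page.get("content") or ""
        if isinstance(content, str):
            pdf_links.update(re.findall(r'https?://\S+?\.pdf', content))

    # append-only, so the processor's read-then-truncate cycle is not disturbed
    with open(QUEUE_FILE, "a", encoding="utf-8") as f:
        for url in sorted(pdf_links):
            f.write(url + "\n")

if __name__ == "__main__":
    enqueue_pdf_links()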