import asyncio
import os
import json

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy

# import database

# --- Configuration ---
PDF_QUEUE_FILE = "pdf_queue.txt"
COOKIES_FILE = "marketline_cookies.json"
IMAGE_OUTPUT_DIR = "./extracted_images"
CHECK_INTERVAL_SECONDS = 60


def load_cookies():
    """Loads cookies from the JSON file if it exists."""
    if not os.path.exists(COOKIES_FILE):
        print(f"Warning: {COOKIES_FILE} not found. Crawling without authentication.")
        return None
    with open(COOKIES_FILE, 'r') as f:
        cookies = json.load(f)
    return {c['name']: c['value'] for c in cookies}


async def process_pdf_queue(cookies):
    """Processes all unique URLs found in the PDF queue file."""
    if not os.path.exists(PDF_QUEUE_FILE):
        return

    print("--- Checking PDF queue for new links ---")
    with open(PDF_QUEUE_FILE, "r") as f:
        urls_to_process = set(line.strip() for line in f if line.strip())

    if not urls_to_process:
        print("PDF queue is empty.")
        return

    print(f"Found {len(urls_to_process)} PDF(s) to process.")
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)

    # Scraping strategy: extract images from each PDF and save them locally.
    pdf_scraping_cfg = PDFContentScrapingStrategy(
        extract_images=True,
        save_images_locally=True,
        image_save_dir=IMAGE_OUTPUT_DIR,
    )
    pdf_run_cfg = CrawlerRunConfig(scraping_strategy=pdf_scraping_cfg)

    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        for url in urls_to_process:
            print(f"\nProcessing PDF: {url}")
            try:
                result = await crawler.arun(url=url, config=pdf_run_cfg, cookies=cookies)
                if not result.success:
                    print(f"Failed to process PDF {result.url}. Error: {result.error_message}")
                    continue

                content = result.markdown.raw_markdown if result.markdown else ""
                print(f"PAGE CONTENT: {content}")

                # page_id = database.add_crawled_page(result.url, content, 'pdf')
                # if page_id and result.media and result.media.get("images"):
                #     print(f"Found {len(result.media['images'])} images in {result.url}")
                #     for img_info in result.media["images"]:
                #         database.add_crawled_image(
                #             page_id=page_id,
                #             page_number=img_info.get('page'),
                #             local_path=img_info.get('path'),
                #             img_format=img_info.get('format')
                #         )

                print(f"Successfully processed and stored PDF: {result.url}")
            except Exception as e:
                print(f"A critical error occurred while processing PDF '{url}': {e}")

    # Clear the queue file once every URL has been attempted.
    with open(PDF_QUEUE_FILE, "w") as f:
        f.write("")

    print("\n--- PDF queue processing finished ---")


async def main():
    """Main entry point that runs the PDF processing loop."""
    # database.setup_database()
    print("PDF Processor service starting...")
    cookies = load_cookies()
    while True:
        await process_pdf_queue(cookies)
        print(f"Queue check finished. Waiting {CHECK_INTERVAL_SECONDS}s for next check.")
        await asyncio.sleep(CHECK_INTERVAL_SECONDS)


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nPDF Processor service stopped by user.")
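
# --- Expected input formats (illustrative sketch; the filenames below are the
# configured defaults, but the URLs, cookie names, and values are hypothetical) ---
#
# load_cookies() expects COOKIES_FILE to be a JSON array of cookie objects that
# each carry at least "name" and "value" keys (a browser-export style dump), e.g.:
#
#   [
#     {"name": "SESSIONID", "value": "abc123", "domain": ".example.com"},
#     {"name": "auth_token", "value": "xyz789", "domain": ".example.com"}
#   ]
#
# process_pdf_queue() expects PDF_QUEUE_FILE to be plain text with one PDF URL
# per line; blank lines and duplicates are ignored, e.g.:
#
#   https://example.com/reports/report-1.pdf
#   https://example.com/reports/report-2.pdf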