ploughshares/docker/crawler/pdf_crawler.py

import asyncio
import os
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
# import database

# --- Configuration ---
PDF_QUEUE_FILE = "pdf_queue.txt"
COOKIES_FILE = "marketline_cookies.json"
IMAGE_OUTPUT_DIR = "./extracted_images"
CHECK_INTERVAL_SECONDS = 60
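# The cookies file is expected to be a JSON list of cookie objects that each
# expose at least "name" and "value" (e.g. a browser-extension export); the
# values below are illustrative only, inferred from the dict comprehension in
# load_cookies():
# [
#   {"name": "sessionid", "value": "abc123", "domain": ".marketline.com"},
#   {"name": "csrftoken", "value": "xyz789", "domain": ".marketline.com"}
# ]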
def load_cookies():
    """Loads cookies from the JSON file if it exists."""
    if not os.path.exists(COOKIES_FILE):
        print(f"Warning: {COOKIES_FILE} not found. Crawling without authentication.")
        return None
    with open(COOKIES_FILE, 'r') as f:
        cookies = json.load(f)
    return {c['name']: c['value'] for c in cookies}
async def process_pdf_queue(cookies):
    """
    Processes all unique URLs found in the PDF queue file.
    """
    if not os.path.exists(PDF_QUEUE_FILE):
        return

    print("--- Checking PDF queue for new links ---")
    with open(PDF_QUEUE_FILE, "r") as f:
        urls_to_process = set(line.strip() for line in f if line.strip())

    if not urls_to_process:
        print("PDF queue is empty.")
        return

    print(f"Found {len(urls_to_process)} PDF(s) to process.")
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
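    # Configure PDF scraping: extract embedded images and save them locally
    # under IMAGE_OUTPUT_DIR so their paths can be recorded later.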
    pdf_scraping_cfg = PDFContentScrapingStrategy(
        extract_images=True,
        save_images_locally=True,
        image_save_dir=IMAGE_OUTPUT_DIR,
    )
    pdf_run_cfg = CrawlerRunConfig(scraping_strategy=pdf_scraping_cfg)

    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        for url in urls_to_process:
            print(f"\nProcessing PDF: {url}")
            try:
                result = await crawler.arun(url=url, config=pdf_run_cfg, cookies=cookies)
                if not result.success:
                    print(f"Failed to process PDF {result.url}. Error: {result.error_message}")
                    continue

                content = result.markdown.raw_markdown if result.markdown else ""
                print(f"PAGE CONTENT: {content}")

                # page_id = database.add_crawled_page(result.url, content, 'pdf')
                # if page_id and result.media and result.media.get("images"):
                #     print(f"Found {len(result.media['images'])} images in {result.url}")
                #     for img_info in result.media["images"]:
                #         database.add_crawled_image(
                #             page_id=page_id,
                #             page_number=img_info.get('page'),
                #             local_path=img_info.get('path'),
                #             img_format=img_info.get('format')
                #         )

                print(f"Successfully processed and stored PDF: {result.url}")
            except Exception as e:
                print(f"A critical error occurred while processing PDF '{url}': {e}")
    with open(PDF_QUEUE_FILE, "w") as f:
        f.write("")

    print("\n--- PDF queue processing finished ---")
async def main():
    """Main entry point that runs the PDF processing loop."""
    # database.setup_database()
    print("PDF Processor service starting...")
    cookies = load_cookies()

    while True:
        await process_pdf_queue(cookies)
        print(f"Queue check finished. Waiting {CHECK_INTERVAL_SECONDS}s for next check.")
        await asyncio.sleep(CHECK_INTERVAL_SECONDS)
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nPDF Processor service stopped by user.")