ploughshares/docker/crawler/pdf_crawler.py

import asyncio
import os
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
# import database

# --- Configuration ---
PDF_QUEUE_FILE = "pdf_queue.txt"
COOKIES_FILE = "marketline_cookies.json"
IMAGE_OUTPUT_DIR = "./extracted_images"
CHECK_INTERVAL_SECONDS = 60
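# The cookies file is expected to be a JSON list of cookie objects that each
# expose at least "name" and "value" (e.g. a browser-extension export); the
# values below are illustrative only, inferred from the dict comprehension in
# load_cookies():
# [
#   {"name": "sessionid", "value": "abc123", "domain": ".marketline.com"},
#   {"name": "csrftoken", "value": "xyz789", "domain": ".marketline.com"}
# ]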
def load_cookies():
    """Loads cookies from the JSON file if it exists."""
    if not os.path.exists(COOKIES_FILE):
        print(f"Warning: {COOKIES_FILE} not found. Crawling without authentication.")
        return None
    with open(COOKIES_FILE, 'r') as f:
        cookies = json.load(f)
    return {c['name']: c['value'] for c in cookies}
async def process_pdf_queue(cookies):
    """
    Processes all unique URLs found in the PDF queue file.
    """
    if not os.path.exists(PDF_QUEUE_FILE):
        return

    print("--- Checking PDF queue for new links ---")
    with open(PDF_QUEUE_FILE, "r") as f:
        urls_to_process = set(line.strip() for line in f if line.strip())

    if not urls_to_process:
        print("PDF queue is empty.")
        return

    print(f"Found {len(urls_to_process)} PDF(s) to process.")
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
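    # Configure PDF scraping: extract embedded images and save them locally
    # under IMAGE_OUTPUT_DIR so their paths can be recorded later.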
    pdf_scraping_cfg = PDFContentScrapingStrategy(
        extract_images=True,
        save_images_locally=True,
        image_save_dir=IMAGE_OUTPUT_DIR,
    )
    pdf_run_cfg = CrawlerRunConfig(scraping_strategy=pdf_scraping_cfg)

    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        for url in urls_to_process:
            print(f"\nProcessing PDF: {url}")
            try:
                result = await crawler.arun(url=url, config=pdf_run_cfg, cookies=cookies)
                if not result.success:
                    print(f"Failed to process PDF {result.url}. Error: {result.error_message}")
                    continue

                content = result.markdown.raw_markdown if result.markdown else ""
                print(f"PAGE CONTENT: {content}")

                # page_id = database.add_crawled_page(result.url, content, 'pdf')
                # if page_id and result.media and result.media.get("images"):
                #     print(f"Found {len(result.media['images'])} images in {result.url}")
                #     for img_info in result.media["images"]:
                #         database.add_crawled_image(
                #             page_id=page_id,
                #             page_number=img_info.get('page'),
                #             local_path=img_info.get('path'),
                #             img_format=img_info.get('format')
                #         )

                print(f"Successfully processed and stored PDF: {result.url}")
            except Exception as e:
                print(f"A critical error occurred while processing PDF '{url}': {e}")
    with open(PDF_QUEUE_FILE, "w") as f:
        f.write("")

    print("\n--- PDF queue processing finished ---")
async def main():
    """Main entry point that runs the PDF processing loop."""
    # database.setup_database()
    print("PDF Processor service starting...")
    cookies = load_cookies()

    while True:
        await process_pdf_queue(cookies)
        print(f"Queue check finished. Waiting {CHECK_INTERVAL_SECONDS}s for next check.")
        await asyncio.sleep(CHECK_INTERVAL_SECONDS)
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nPDF Processor service stopped by user.")