import asyncio
import os
import json

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy

# import database

# --- Configuration ---
PDF_QUEUE_FILE = "pdf_queue.txt"
COOKIES_FILE = "marketline_cookies.json"
IMAGE_OUTPUT_DIR = "./extracted_images"
CHECK_INTERVAL_SECONDS = 60
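
# Note: PDF_QUEUE_FILE is assumed to be a plain-text file with one PDF URL per
# line (that is how process_pdf_queue() reads it), e.g.:
#   https://example.com/reports/annual-report.pdf
#   https://example.com/reports/market-outlook.pdf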


def load_cookies():
    """Loads cookies from the JSON file if it exists."""
    if not os.path.exists(COOKIES_FILE):
        print(f"Warning: {COOKIES_FILE} not found. Crawling without authentication.")
        return None
    with open(COOKIES_FILE, 'r') as f:
        cookies = json.load(f)
    # Collapse the browser-export cookie list into a simple name -> value mapping.
    return {c['name']: c['value'] for c in cookies}
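
# For reference, COOKIES_FILE is assumed to be a standard browser/extension
# cookie export (only "name" and "value" are used above; extra keys are ignored):
# [
#   {"name": "SESSION_ID", "value": "abc123", "domain": ".example.com"},
#   {"name": "auth_token", "value": "xyz789", "domain": ".example.com"}
# ]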


async def process_pdf_queue(cookies):
    """
    Processes all unique URLs found in the PDF queue file.
    """
    if not os.path.exists(PDF_QUEUE_FILE):
        return

    print("--- Checking PDF queue for new links ---")
    with open(PDF_QUEUE_FILE, "r") as f:
        urls_to_process = set(line.strip() for line in f if line.strip())

    if not urls_to_process:
        print("PDF queue is empty.")
        return

    print(f"Found {len(urls_to_process)} PDF(s) to process.")
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)

    # Scrape PDF content and save any embedded images to IMAGE_OUTPUT_DIR.
    pdf_scraping_cfg = PDFContentScrapingStrategy(
        extract_images=True,
        save_images_locally=True,
        image_save_dir=IMAGE_OUTPUT_DIR,
    )
    pdf_run_cfg = CrawlerRunConfig(scraping_strategy=pdf_scraping_cfg)

    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        for url in urls_to_process:
            print(f"\nProcessing PDF: {url}")
            try:
                result = await crawler.arun(url=url, config=pdf_run_cfg, cookies=cookies)
                if not result.success:
                    print(f"Failed to process PDF {result.url}. Error: {result.error_message}")
                    continue

                content = result.markdown.raw_markdown if result.markdown else ""
                print(f"PAGE CONTENT: {content}")
                # page_id = database.add_crawled_page(result.url, content, 'pdf')

                # if page_id and result.media and result.media.get("images"):
                #     print(f"Found {len(result.media['images'])} images in {result.url}")
                #     for img_info in result.media["images"]:
                #         database.add_crawled_image(
                #             page_id=page_id,
                #             page_number=img_info.get('page'),
                #             local_path=img_info.get('path'),
                #             img_format=img_info.get('format')
                #         )
                print(f"Successfully processed and stored PDF: {result.url}")

            except Exception as e:
                print(f"A critical error occurred while processing PDF '{url}': {e}")

    # Clear the queue file once every URL has been attempted.
    with open(PDF_QUEUE_FILE, "w") as f:
        f.write("")
    print("\n--- PDF queue processing finished ---")


async def main():
    """Main entry point that runs the PDF processing loop."""
    # database.setup_database()
    print("PDF Processor service starting...")
    cookies = load_cookies()
    while True:
        await process_pdf_queue(cookies)
        print(f"Queue check finished. Waiting {CHECK_INTERVAL_SECONDS}s for next check.")
        await asyncio.sleep(CHECK_INTERVAL_SECONDS)


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\nPDF Processor service stopped by user.")