scraper handoff + updated reqs

This commit is contained in:
coleWesterveld 2025-09-03 19:31:14 -04:00
parent 725f028d69
commit b1f3115999
8 changed files with 443 additions and 139 deletions

View File

@@ -1,2 +1,4 @@
.env
marketline_cookies.json
venv/
marketline_session/

View File

@@ -240,8 +240,8 @@ def main():
# basic required-field check (we want the API-required fields present)
if not is_valid_transaction(tx):
print(" ⚠️ Skipping — missing required API fields in extracted transaction:", tx)
continue
print(" ⚠️ missing required API fields in extracted transaction:", tx)
#continue
# Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
# Save the item
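For context, `is_valid_transaction` is not shown in this hunk; a minimal sketch of the kind of required-field check it presumably performs is below. The field names in `REQUIRED_FIELDS` are assumptions, not the project's actual schema.

```python
# Hypothetical sketch of a required-field validator; the real is_valid_transaction
# and the exact field names are not shown in this diff.
REQUIRED_FIELDS = ("date", "buyer", "seller", "amount")  # assumed field names

def is_valid_transaction(tx: dict) -> bool:
    """Return True only if every API-required field is present and non-empty."""
    return all(tx.get(field) not in (None, "") for field in REQUIRED_FIELDS)
```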

View File

@@ -1 +1 @@
*.json

View File

@@ -1,186 +1,138 @@
# NOT USED CURRENTLY
# Temporarily (possibly permanently) switched to the handoff script to get around CAPTCHAs and similar blockers.
# The handoff flow is very similar anyway.
import asyncio
from playwright.async_api import async_playwright, Page
from playwright.async_api import async_playwright
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import ContentScrapingStrategy, ScrapingResult, LXMLWebScrapingStrategy
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
from crawl4ai.deep_crawling import DFSDeepCrawlStrategy, BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.deep_crawling.filters import URLPatternFilter
from datetime import datetime
import logging
import time
# --- CONFIGURATION ---
# TODO: this will need to change for different organizations (i.e. universities)
# Set this to the university login link used to access Marketline.
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Bs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"
# Shouldn't need to change. This is the page we wait for after login, which triggers saving cookies.
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Fs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fVLBbsIwDP2VKvc2aaBAI4rE4DAktiFgO%2BwypcGFSGnSxenY%2Fn4FNo1dOFp%2Bfs%2Fv2WOUtWnEtA0Hu4b3FjBEn7WxKM6NgrTeCidRo7CyBhRBic30YSl4wkTjXXDKGRJNEcEH7ezMWWxr8BvwH1rB83pZkEMIDQpKZSeSGF0mrdu3YJpDoiTt6v0RG7o56LJ0BjoIoqMnDU5XT5stiebdUtrKE%2F0fmXF7bZNaK%2B%2FQVcFZoy0kytW0hAGXKS9jruQu7ucyjWVWyXiUVf2ql2YDOdzRkztOosW8IG9pztRI5izLh9BPcz5i%2BY4N0q6oFAz7vQ6G2MLCYpA2FIQznsUsj1lvy5hgPcGGryRa%2FYRxp%2B1O2%2F3t5MoLCMX9druKLzZfwOPZYgcgk%2FFpQ3EW9lcXuU0rf89AJjdDx2ZMr%2FgvYo147AgX85UzWn1FU2PcceZBBihISujkMvL%2FVybf&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Dsa3ysaynan5loqzpdleq8f1v4ji31utw%26csrfmiddlewaretoken%3DVnbRmbY0l1tnxKqHdJnkZ1yYGlJgPueoiNOMivmejDxCbeVl3A0iV5FdEFmO3DgG%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue"
HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"
# the root page to seed crawling
CRAWLPAGE_URL = "https://advantage.marketline.com/Search?industry=2800001"
# trying out another page
# CRAWLPAGE_URL = "https://www.defensenews.com/"
# name of file where cookies are saved
CRAWLPAGE_URL = "https://advantage.marketline.com/News/NewsListing?q[]=aerospace+and+defense&IsSearchApi=true"
COOKIES_FILE = "marketline_cookies.json"
# --- CRAWLER SETTINGS ---
DEPTH = 3
COUNT = 100
# TODO: maybe make this list more comprehensive?
DEPTH = 2 # A depth of 2 is enough: Page 1 (List) -> Page 2 (Articles)
COUNT = 50
# UPDATED: Expanded keywords to better score article pages
SCRAPER_KEYWORDS = [
# Core Terms
"arms export", "arms sale", "arms trade", "weapons export", "weapons deal",
"military export", "defence contract", "defense contract",
# Canadian Context
"canadian armed forces", "global affairs canada", "canadian defence",
"canadian military", "royal canadian navy", "royal canadian air force",
# Equipment & Technology
"armoured vehicle", "light armoured vehicle", "lav", "naval ship", "warship",
"frigate", "fighter jet", "military aircraft", "surveillance", "radar",
"artillery", "munitions", "firearms", "aerospace",
# Action & Policy Terms
"procurement", "acquisition", "military aid", "export permit", "itar"
"arms", "weapons", "military", "defence", "defense", "aerospace",
"canadian armed forces", "caf", "dnd", "global affairs canada",
"export", "sale", "contract", "procurement", "acquisition",
"armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
"aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
"general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
]
# runs login process and saves cookies so that we can run the scraping with authentication
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
async def login_and_save_cookies():
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False)
context = await browser.new_context()
page = await context.new_page()
try:
logging.info("Starting login process... Please complete login in the browser.")
await page.goto(LOGIN_URL)
await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
print("Login detected. Saving session cookies...")
await asyncio.sleep(45)  # give post-login redirects time to settle (a blocking time.sleep would stall the event loop)
# await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
logging.info("Login successful. Saving session cookies...")
cookies = await context.cookies()
with open(COOKIES_FILE, "w") as f:
json.dump(cookies, f)
print("Cookies saved successfully!")
await crawl_with_saved_cookies()
json.dump(cookies, f, indent=2)
logging.info(f"Cookies saved to '{COOKIES_FILE}'.")
except Exception as e:
print(f"Login failed: {e}")
print("Error details:")
print(await page.content())
logging.error(f"Login failed: {e}")
finally:
await context.close()
await browser.close()
def save_results_to_json(successful_data, failed_pages):
"""
Saves the successful and failed crawl results into separate JSON files
in a dedicated directory.
"""
output_dir = "crawl_results"
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"crawl_results_{timestamp}"
os.makedirs(output_dir, exist_ok=True)
print(f"\n💾 Saving results to '{output_dir}' directory...")
logging.info(f"Saving results to '{output_dir}' directory...")
# Define file paths
successful_file = os.path.join(output_dir, "successful_pages.json")
failed_file = os.path.join(output_dir, "failed_pages.json")
# Save successfully scraped data
with open(successful_file, "w", encoding="utf-8") as f:
json.dump(successful_data, f, indent=4, ensure_ascii=False)
print(f" Saved data for {len(successful_data)} successful pages to '{successful_file}'")
logging.info(f"Saved {len(successful_data)} pages to '{successful_file}'")
# Save failed pages if any
if failed_pages:
failed_file = os.path.join(output_dir, "failed_pages.json")
with open(failed_file, "w", encoding="utf-8") as f:
json.dump(failed_pages, f, indent=4, ensure_ascii=False)
print(f" Saved info for {len(failed_pages)} failed pages to '{failed_file}'")
logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")
# runs the crawler with the cookies collected during login
async def crawl_with_saved_cookies():
if not os.path.exists(COOKIES_FILE):
print("No cookies found. Please run login first.")
return
logging.warning("No cookies found. Running login first...")
await login_and_save_cookies()
if not os.path.exists(COOKIES_FILE):
logging.error("Login failed or was aborted. Exiting.")
return
with open(COOKIES_FILE, "r") as f:
cookies = json.load(f)
try:
cookies = json.load(f)
except json.JSONDecodeError:
logging.error(f"Error reading cookies file. Please delete '{COOKIES_FILE}' and run again.")
return
browser_config = BrowserConfig(cookies=cookies)
logging.info(f"Loaded {len(cookies)} cookies for crawling.")
browser_config = BrowserConfig(cookies=cookies, headless=False)
# NEW: Define a filter to only follow links that are news articles
article_filter = URLPatternFilter(patterns=[r"/news/"])
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=DEPTH,
max_pages=COUNT,
url_scorer=KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS,),
#url_scorer=(),
# url_filters=[article_filter] # UPDATED: Add the filter to the strategy
),
scraping_strategy=LXMLWebScrapingStrategy(),
# TODO: scrape the PDFs better
# scraping_strategy=PDFCrawlerStrategy(),
verbose=True,
stream=True,
page_timeout=30000
# scraping_strategy=
# LXMLWebScrapingStrategy(),
verbose=True, stream=True, page_timeout=120000,
wait_until="domcontentloaded"
)
successful_data = []
failed_pages = []
logging.info("Starting crawl...")
async with AsyncWebCrawler(config=browser_config) as crawler:
# We start with the list page. The filter will ensure we only crawl article links from it.
async for result in await crawler.arun(CRAWLPAGE_URL, config=config):
if result.success:
depth = result.metadata.get("depth", 0)
print("RESIULT:", result)
score = result.metadata.get("score", 0)
# here we could look at a few things, the HTML, markdown, raw text, etc.
scraped_content = result.markdown
print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}")
# NEW: Print a preview of the content to confirm it's being scraped
print(f" 📄 Content length: {len(scraped_content)}. Preview: {scraped_content[:120]}...")
print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
successful_data.append({
"url": result.url,
"content": scraped_content,
"depth": depth,
"score": round(score, 2)
"url": result.url, "content": result.markdown,
"depth": result.metadata.get("depth", 0), "score": round(score, 2),
"timestamp": datetime.now().isoformat()
})
else:
failed_pages.append({
'url': result.url,
'error': result.error_message,
'depth': result.metadata.get("depth", 0)
})
print(f"❌ Failed: {result.url} - {result.error_message}")
print(f"📊 Results: {len(successful_data)} successful, {len(failed_pages)} failed")
failed_pages.append({'url': result.url, 'error': result.error_message})
logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
save_results_to_json(successful_data, failed_pages)
# Analyze failures by depth
if failed_pages:
failure_by_depth = {}
for failure in failed_pages:
depth = failure['depth']
failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1
print("❌ Failures by depth:")
for depth, count in sorted(failure_by_depth.items()):
print(f" Depth {depth}: {count} failures")
if __name__ == "__main__":
# Choose which function to run
# 1. First, run the login function once to get your cookies
# asyncio.run(login_and_save_cookies())
# 2. Then, comment out the login line and run the crawl
asyncio.run(crawl_with_saved_cookies())
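If the saved cookies have expired, this cookie-reuse flow will quietly hit login walls, so a quick pre-flight check of `marketline_cookies.json` can save a wasted crawl. A minimal sketch (not part of this commit), assuming Playwright's cookie schema where `expires` is a Unix timestamp and `-1` marks a session cookie:

```python
# Sketch: warn about expired cookies before crawling. Assumes Playwright's
# cookie schema ("name", "expires" as a Unix timestamp, -1 for session cookies).
import json
import time

def check_cookie_freshness(path: str = "marketline_cookies.json") -> bool:
    with open(path) as f:
        cookies = json.load(f)
    now = time.time()
    expired = [c["name"] for c in cookies
               if c.get("expires", -1) not in (-1, None) and c["expires"] < now]
    if expired:
        print(f"⚠️ {len(expired)} cookie(s) already expired: {expired}")
        return False
    return True
```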

View File

@@ -0,0 +1,176 @@
# Like the crawler, but with a session handoff instead of a cookie-sharing approach.
# Opens a non-headless browser; the user logs in and solves any CAPTCHAs, then hands the session off to the scraper.
# More reliable, easier to debug, and CAPTCHA-resistant.
import asyncio
from itertools import chain
from playwright.async_api import async_playwright
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import URLPatternFilter
from datetime import datetime
import logging
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
# --- CONFIGURATION ---
# MODIFIED: Only the login URL is needed for the initial navigation.
# The user will navigate to the crawl starting page manually.
LOGIN_URL = "https://guides.lib.uoguelph.ca/az/databases?q=marketline"
# --- CRAWLER SETTINGS ---
DEPTH = 2
COUNT = 50
SCRAPER_KEYWORDS = [
"arms", "weapons", "military", "defence", "defense", "aerospace",
"canadian armed forces", "caf", "dnd", "global affairs canada",
"export", "sale", "contract", "procurement", "acquisition",
"armoured vehicle", "lav", "naval", "warship", "frigate", "fighter jet",
"aircraft", "surveillance", "radar", "drone", "uav", "missile", "artillery",
"general dynamics", "lockheed martin", "bombardier", "cae", "thales canada", "wescam"
]
# class DebugFilter(BaseFilter):
# def apply(self, urls):
# print("\n=== LINKS BEFORE FILTERING ===")
# for u in urls:
# print(u)
# return urls  # don't drop anything
include_words = URLPatternFilter(patterns=["*News*", "*news*"])
deny_words = URLPatternFilter(patterns=["*Analysis*", "*Sectors*", "*Commentsandopinions*", "*Dashboard*", "*Homepage*"], reverse=True)
# --- SETUP LOGGING ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def save_results_to_json(successful_data, failed_pages):
"""Saves the crawl results to timestamped JSON files in a new directory."""
# timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# We could timestamp this, but we don't because a single file is easier to analyze.
# Later we will probably keep one folder of timestamped files and filter them with a regex.
# For now we just overwrite.
output_dir = "crawl_results"
os.makedirs(output_dir, exist_ok=True)
logging.info(f"Saving results to '{output_dir}' directory...")
successful_file = os.path.join(output_dir, "successful_pages.json")
with open(successful_file, "w", encoding="utf-8") as f:
json.dump(successful_data, f, indent=4, ensure_ascii=False)
logging.info(f"Saved {len(successful_data)} successful pages to '{successful_file}'")
if failed_pages:
failed_file = os.path.join(output_dir, "failed_pages.json")
with open(failed_file, "w", encoding="utf-8") as f:
json.dump(failed_pages, f, indent=4, ensure_ascii=False)
logging.info(f"Saved {len(failed_pages)} failed pages to '{failed_file}'")
async def main():
"""
Main function to handle manual login, capture the session state from the
active tab, and then hand it off to the crawler.
"""
# --- STEP 1: Manual Login in a Temporary Browser ---
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False)
context = await browser.new_context()
page = await context.new_page() # This is the initial page
logging.info("A browser window has opened. Please complete the following steps:")
logging.info(f"1. Log in and navigate to the exact page where you want the crawl to begin.")
logging.info("2. Solve any CAPTCHAs or 2FA prompts.")
await page.goto(LOGIN_URL)
input("\n>>> Press Enter in this console window once you are logged in and on the starting page... <<<\n")
# MODIFIED: Instead of using the original 'page' object, get the current active tab.
# This correctly handles cases where the login process opens a new tab.
print("ALL PAGES:")
for page in context.pages:
print("URL: ", page.url)
active_page = context.pages[-1]
start_url = "https://advantage.marketline.com/News/NewsListing?q%5B%5D=aerospace+and+defence&IsSearchApi=true&exactword=1"
logging.info(f"Login complete. Using active tab URL to start crawl: {start_url}")
# Capture the full session state (cookies, localStorage, etc.)
storage_state = await context.storage_state()
# We no longer need this temporary browser.
await browser.close()
# --- STEP 2: Configure and Run the Crawler with the Captured State ---
# Pass the captured 'storage_state' dictionary to the crawler's browser configuration.
browser_config = BrowserConfig(
headless=False,
storage_state=storage_state # This injects your logged-in session.
)
scorer = KeywordRelevanceScorer(
keywords=SCRAPER_KEYWORDS,
weight=0.7
)
filter = FilterChain([
# DebugFilter(),
include_words,
deny_words
])
# This configuration remains the same
config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=DEPTH,
max_pages=COUNT,
url_scorer=scorer,
filter_chain=filter
),
verbose=True,
stream=True,
page_timeout=120000,
wait_until="domcontentloaded"
)
successful_data = []
failed_pages = []
logging.info("Starting crawler with the captured session state...")
async with AsyncWebCrawler(config=browser_config) as crawler:
# The crawler will now begin at the correct URL you navigated to.
async for result in await crawler.arun(start_url, config=config):
if result.success:
all_links = [
l["href"]
for l in chain(result.links.get("internal", []), result.links.get("external", []))
]
print(f"✅ Scraped: {result.url}")
print("Filtered links:")
# Apply filters one URL at a time
for url in all_links:
if include_words.apply(url) and deny_words.apply(url):
print(" ->", url)
score = result.metadata.get("score", 0)
print(f"✅ Scraped: {result.url} (Score: {score:.2f})")
successful_data.append({
"url": result.url, "content": result.markdown,
"depth": result.metadata.get("depth", 0), "score": round(score, 2),
"timestamp": datetime.now().isoformat()
})
else:
print(f"❌ Failed: {result.url} - {result.error_message}")
failed_pages.append({'url': result.url, 'error': result.error_message})
logging.info(f"Crawl completed! Successful: {len(successful_data)}, Failed: {len(failed_pages)}")
save_results_to_json(successful_data, failed_pages)
if __name__ == "__main__":
asyncio.run(main())
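One possible extension (not part of this commit): the captured `storage_state` only lives in memory, so every run requires a fresh manual login. Persisting it to disk and reusing it while it is still valid might look roughly like the sketch below; the `marketline_session/storage_state.json` path is an assumption (the updated `.gitignore` above already ignores `marketline_session/`), and `BrowserConfig(storage_state=dict)` is the same call the handoff script already uses.

```python
# Sketch: persist and reuse the Playwright storage_state between runs.
# STATE_FILE is an assumed name, not something defined in this commit.
import json
import os

STATE_FILE = "marketline_session/storage_state.json"

def save_state(storage_state: dict) -> None:
    os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
    with open(STATE_FILE, "w") as f:
        json.dump(storage_state, f, indent=2)

def load_state() -> dict | None:
    if not os.path.exists(STATE_FILE):
        return None
    with open(STATE_FILE) as f:
        return json.load(f)

# Usage inside main(): state = load_state(); if it is None, run the manual
# handoff and call save_state(storage_state) before closing the browser.
```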

View File

@@ -0,0 +1,93 @@
import asyncio
import os
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
# import database
# --- Configuration ---
PDF_QUEUE_FILE = "pdf_queue.txt"
COOKIES_FILE = "marketline_cookies.json"
IMAGE_OUTPUT_DIR = "./extracted_images"
CHECK_INTERVAL_SECONDS = 60
def load_cookies():
"""Loads cookies from the JSON file if it exists."""
if not os.path.exists(COOKIES_FILE):
print("Warning: cookies.json not found. Crawling without authentication.")
return None
with open(COOKIES_FILE, 'r') as f:
cookies = json.load(f)
return {c['name']: c['value'] for c in cookies}
async def process_pdf_queue(cookies):
"""
Processes all unique URLs found in the PDF queue file.
"""
if not os.path.exists(PDF_QUEUE_FILE):
return
print("--- Checking PDF queue for new links ---")
with open(PDF_QUEUE_FILE, "r") as f:
urls_to_process = set(line.strip() for line in f if line.strip())
if not urls_to_process:
print("PDF queue is empty.")
return
print(f"Found {len(urls_to_process)} PDF(s) to process.")
os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
pdf_scraping_cfg = PDFContentScrapingStrategy(
extract_images=True,
save_images_locally=True,
image_save_dir=IMAGE_OUTPUT_DIR,
)
pdf_run_cfg = CrawlerRunConfig(scraping_strategy=pdf_scraping_cfg)
async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
for url in urls_to_process:
print(f"\nProcessing PDF: {url}")
try:
result = await crawler.arun(url=url, config=pdf_run_cfg, cookies=cookies)
if not result.success:
print(f"Failed to process PDF {result.url}. Error: {result.error_message}")
continue
content = result.markdown.raw_markdown if result.markdown else ""
print(f"PAGE CONTENT: {content}")
# page_id = database.add_crawled_page(result.url, content, 'pdf')
# if page_id and result.media and result.media.get("images"):
# print(f"Found {len(result.media['images'])} images in {result.url}")
# for img_info in result.media["images"]:
# database.add_crawled_image(
# page_id=page_id,
# page_number=img_info.get('page'),
# local_path=img_info.get('path'),
# img_format=img_info.get('format')
# )
print(f"Successfully processed and stored PDF: {result.url}")
except Exception as e:
print(f"A critical error occurred while processing PDF '{url}': {e}")
with open(PDF_QUEUE_FILE, "w") as f:
f.write("")
print("\n--- PDF queue processing finished ---")
async def main():
"""Main entry point that runs the PDF processing loop."""
# database.setup_database()
print("PDF Processor service starting...")
cookies = load_cookies()
while True:
await process_pdf_queue(cookies)
print(f"Queue check finished. Waiting {CHECK_INTERVAL_SECONDS}s for next check.")
await asyncio.sleep(CHECK_INTERVAL_SECONDS)
if __name__ == "__main__":
try:
asyncio.run(main())
except KeyboardInterrupt:
print("\nPDF Processor service stopped by user.")

View File

View File

@@ -1,29 +1,110 @@
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.12.15
aiosignal==1.4.0
aiosqlite==0.21.0
alphashape==1.3.1
annotated-types==0.7.0
anyio==4.10.0
attrs==25.3.0
beautifulsoup4==4.13.4
Brotli==1.1.0
cachetools==5.5.2
certifi==2025.7.14
charset-normalizer==3.4.2
dotenv==0.9.9
certifi==2025.8.3
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.3
click==8.2.1
click-log==0.4.0
Crawl4AI==0.7.4
cryptography==45.0.6
distro==1.9.0
fake-http-header==0.3.5
fake-useragent==2.2.0
filelock==3.19.1
frozenlist==1.7.0
fsspec==2025.7.0
google==3.0.0
google-ai-generativelanguage==0.1.0
google-ai-generativelanguage==0.6.15
google-api-core==2.25.1
google-api-python-client==2.177.0
google-api-python-client==2.179.0
google-auth==2.40.3
google-auth-httplib2==0.2.0
google-generativeai==0.1.0rc1
google-generativeai==0.8.5
googleapis-common-protos==1.70.0
grpcio==1.70.0
grpcio-status==1.62.3
greenlet==3.2.4
grpcio==1.74.0
grpcio-status==1.71.2
h11==0.16.0
h2==4.2.0
hf-xet==1.1.8
hpack==4.1.0
httpcore==1.0.9
httplib2==0.22.0
httpx==0.28.1
huggingface-hub==0.34.4
humanize==4.12.3
hyperframe==6.1.0
idna==3.10
importlib_metadata==8.7.0
Jinja2==3.1.6
jiter==0.10.0
joblib==1.5.1
jsonschema==4.25.1
jsonschema-specifications==2025.4.1
lark==1.2.2
litellm==1.75.9
lxml==5.4.0
markdown-it-py==4.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
multidict==6.6.4
networkx==3.5
nltk==3.9.1
numpy==2.3.2
openai==1.100.2
packaging==25.0
patchright==1.52.5
pillow==11.3.0
playwright==1.54.0
propcache==0.3.2
proto-plus==1.26.1
protobuf==4.25.8
protobuf==5.29.5
psutil==7.0.0
pyasn1==0.6.1
pyasn1-modules==0.4.2
pyparsing==3.1.4
python-dotenv==1.0.1
requests==2.32.4
pyasn1_modules==0.4.2
pycparser==2.22
pydantic==2.11.7
pydantic_core==2.33.2
pyee==13.0.0
Pygments==2.19.2
pyOpenSSL==25.1.0
pyparsing==3.2.3
PyPDF2==3.0.1
python-dotenv==1.1.1
PyYAML==6.0.2
rank-bm25==0.2.2
referencing==0.36.2
regex==2025.7.34
requests==2.32.5
rich==14.1.0
rpds-py==0.27.0
rsa==4.9.1
rtree==1.4.1
scipy==1.16.1
shapely==2.1.1
sniffio==1.3.1
snowballstemmer==2.2.0
soupsieve==2.7
typing-extensions==4.13.2
uritemplate==4.1.1
urllib3==2.2.3
tf-playwright-stealth==1.2.0
tiktoken==0.11.0
tokenizers==0.21.4
tqdm==4.67.1
trimesh==4.7.4
typing-inspection==0.4.1
typing_extensions==4.14.1
uritemplate==4.2.0
urllib3==2.5.0
xxhash==3.5.0
yarl==1.20.1
zipp==3.23.0