From ed0dc3c77796af5270ffed226b1491575dc0e0e5 Mon Sep 17 00:00:00 2001 From: jChenvan <188939308+jChenvan@users.noreply.github.com> Date: Wed, 3 Sep 2025 14:59:44 -0400 Subject: [PATCH] More console messages for Dorks crawler --- docker/crawler-google-alerts/get_all_feed_contents.py | 10 ++++++++-- docker/crawler-google-alerts/main.py | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docker/crawler-google-alerts/get_all_feed_contents.py b/docker/crawler-google-alerts/get_all_feed_contents.py index aed92a7..187f80d 100644 --- a/docker/crawler-google-alerts/get_all_feed_contents.py +++ b/docker/crawler-google-alerts/get_all_feed_contents.py @@ -127,15 +127,21 @@ async def fetch_site(url: str) -> str | None: for element in main_content(['script', 'style', 'aside']): # type: ignore element.decompose() + main_text = main_content.get_text(separator='\n', strip=True) + main_text = clean_string(main_text) + print(f"SUCCESSFUL FETCH: {url}") + print(f"FETCH CONTENT: {main_text[:140]}...") # .get_text() with separator and strip for cleaner output - return main_content.get_text(separator='\n', strip=True) + return main_text else: # Fallback if no specific container is found (less reliable) print("WARNING: No main content container found. Falling back to body.") if soup.body: body_text = soup.body.get_text(separator='\n', strip=True) + body_text = clean_string(body_text) print(f"SUCCESSFUL FETCH: {url}") + print(f"FETCH CONTENT: {body_text[:140]}...") return body_text except Exception as e: @@ -178,7 +184,7 @@ async def get_all_feed_contents() -> List[Dict[str, str]]: if content: pages.append({ "url": url, - "content": clean_string(content) + "content": content }) print(f"\nSuccessfully fetched {len(pages)} webpages.") diff --git a/docker/crawler-google-alerts/main.py b/docker/crawler-google-alerts/main.py index aca0b5b..16dfbb6 100644 --- a/docker/crawler-google-alerts/main.py +++ b/docker/crawler-google-alerts/main.py @@ -239,7 +239,7 @@ async def main(): # If model says 'none', skip by default (these are the irrelevant ones like US missile contracts) if relevance == "none": - print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", explanation[:200]) + print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", tx) continue # basic required-field check (we want the API-required fields present) @@ -250,7 +250,7 @@ async def main(): # Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now # Save the item all_extracted_deals.append(tx) - print(f" ✔️ Kept transaction: {tx.get('company_division')} → {tx.get('recipient')} ({relevance})") # type: ignore + print(f" ✔️ Kept transaction: {tx}") # type: ignore # Respect rate limit time.sleep(1)