From ed0dc3c77796af5270ffed226b1491575dc0e0e5 Mon Sep 17 00:00:00 2001
From: jChenvan <188939308+jChenvan@users.noreply.github.com>
Date: Wed, 3 Sep 2025 14:59:44 -0400
Subject: [PATCH] More console messages for Dorks crawler

---
 docker/crawler-google-alerts/get_all_feed_contents.py | 10 ++++++++--
 docker/crawler-google-alerts/main.py                  |  4 ++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/docker/crawler-google-alerts/get_all_feed_contents.py b/docker/crawler-google-alerts/get_all_feed_contents.py
index aed92a7..187f80d 100644
--- a/docker/crawler-google-alerts/get_all_feed_contents.py
+++ b/docker/crawler-google-alerts/get_all_feed_contents.py
@@ -127,15 +127,21 @@ async def fetch_site(url: str) -> str | None:
                 for element in main_content(['script', 'style', 'aside']): # type: ignore
                     element.decompose()
 
+                main_text = main_content.get_text(separator='\n', strip=True)
+                main_text = clean_string(main_text)
+
                 print(f"SUCCESSFUL FETCH: {url}")
+                print(f"FETCH CONTENT: {main_text[:140]}...")
                 # .get_text() with separator and strip for cleaner output
-                return main_content.get_text(separator='\n', strip=True)
+                return main_text
             else:
                 # Fallback if no specific container is found (less reliable)
                 print("WARNING: No main content container found. Falling back to body.")
                 if soup.body:
                     body_text = soup.body.get_text(separator='\n', strip=True)
+                    body_text = clean_string(body_text)
                     print(f"SUCCESSFUL FETCH: {url}")
+                    print(f"FETCH CONTENT: {body_text[:140]}...")
                     return body_text
             
         except Exception as e:
@@ -178,7 +184,7 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
         if content:
             pages.append({
                 "url": url,
-                "content": clean_string(content)
+                "content": content
             })
 
     print(f"\nSuccessfully fetched {len(pages)} webpages.")
diff --git a/docker/crawler-google-alerts/main.py b/docker/crawler-google-alerts/main.py
index aca0b5b..16dfbb6 100644
--- a/docker/crawler-google-alerts/main.py
+++ b/docker/crawler-google-alerts/main.py
@@ -239,7 +239,7 @@ async def main():
 
             # If model says 'none', skip by default (these are the irrelevant ones like US missile contracts)
             if relevance == "none":
-                print("   ⚪ Skipping — model marked this as non-Canadian. Explanation:", explanation[:200])
+                print("   ⚪ Skipping — model marked this as non-Canadian. Explanation:", tx)
                 continue
 
             # basic required-field check (we want the API-required fields present)
@@ -250,7 +250,7 @@ async def main():
             # Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
             # Save the item
             all_extracted_deals.append(tx)
-            print(f"   ✔️ Kept transaction: {tx.get('company_division')} → {tx.get('recipient')} ({relevance})") # type: ignore
+            print(f"   ✔️ Kept transaction: {tx}") # type: ignore
 
         # Respect rate limit
         time.sleep(1)