More console messages for Dorks crawler
ci/woodpecker/push/woodpecker: Pipeline was successful
commit ed0dc3c777
parent e6100dbfd9
@@ -127,15 +127,21 @@ async def fetch_site(url: str) -> str | None:
             for element in main_content(['script', 'style', 'aside']):  # type: ignore
                 element.decompose()
 
+            main_text = main_content.get_text(separator='\n', strip=True)
+            main_text = clean_string(main_text)
+
             print(f"SUCCESSFUL FETCH: {url}")
+            print(f"FETCH CONTENT: {main_text[:140]}...")
             # .get_text() with separator and strip for cleaner output
-            return main_content.get_text(separator='\n', strip=True)
+            return main_text
         else:
             # Fallback if no specific container is found (less reliable)
             print("WARNING: No main content container found. Falling back to body.")
             if soup.body:
                 body_text = soup.body.get_text(separator='\n', strip=True)
+                body_text = clean_string(body_text)
                 print(f"SUCCESSFUL FETCH: {url}")
+                print(f"FETCH CONTENT: {body_text[:140]}...")
                 return body_text
 
     except Exception as e:
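Note: the new FETCH CONTENT lines only echo a 140-character preview of the cleaned page text. A minimal sketch of the extract-clean-preview pattern, assuming clean_string collapses whitespace (its actual implementation lives elsewhere in the repo and is not shown in this diff):

import re
from bs4 import BeautifulSoup

def clean_string(text: str) -> str:
    # Hypothetical stand-in: collapse runs of whitespace into single spaces.
    return re.sub(r'\s+', ' ', text).strip()

def extract_preview(html: str, url: str) -> str | None:
    soup = BeautifulSoup(html, 'html.parser')
    main_content = soup.find('main') or soup.find('article')  # assumed container lookup
    if not main_content:
        return None
    for element in main_content(['script', 'style', 'aside']):
        element.decompose()  # drop non-content tags in place
    main_text = clean_string(main_content.get_text(separator='\n', strip=True))
    print(f"SUCCESSFUL FETCH: {url}")
    print(f"FETCH CONTENT: {main_text[:140]}...")  # slicing never raises, even on short text
    return main_text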
@@ -178,7 +184,7 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
         if content:
             pages.append({
                 "url": url,
-                "content": clean_string(content)
+                "content": content
             })
 
     print(f"\nSuccessfully fetched {len(pages)} webpages.")
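Because fetch_site now returns text that has already been through clean_string, the page dict in get_all_feed_contents stores the content as-is. A rough sketch of that call site under the new behaviour; the collect name and the asyncio.gather usage are illustrative, only fetch_site and the pages shape come from the diff:

import asyncio

async def collect(urls: list[str]) -> list[dict[str, str]]:
    pages: list[dict[str, str]] = []
    # fetch_site is the coroutine patched above; it returns cleaned text or None.
    results = await asyncio.gather(*(fetch_site(u) for u in urls))
    for url, content in zip(urls, results):
        if content:  # skip failed or empty fetches
            pages.append({"url": url, "content": content})  # no second clean_string pass
    print(f"\nSuccessfully fetched {len(pages)} webpages.")
    return pages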
@@ -239,7 +239,7 @@ async def main():
 
         # If model says 'none', skip by default (these are the irrelevant ones like US missile contracts)
         if relevance == "none":
-            print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", explanation[:200])
+            print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", tx)
            continue
 
         # basic required-field check (we want the API-required fields present)
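The skip message now prints the whole tx dict instead of a 200-character slice of the explanation, so the console shows the full record the model rejected. Illustrative shape of that gate; every field except relevance is an assumed example:

tx = {
    "relevance": "none",
    "company_division": "Example Corp",  # assumed example values
    "recipient": "US DoD",
    "explanation": "No Canadian entity is involved in this contract.",
}
relevance = tx.get("relevance", "none")
if relevance == "none":
    # One console line now carries both the skip reason and the offending record.
    print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", tx)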
@@ -250,7 +250,7 @@ async def main():
         # Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
         # Save the item
         all_extracted_deals.append(tx)
-        print(f" ✔️ Kept transaction: {tx.get('company_division')} → {tx.get('recipient')} ({relevance})")  # type: ignore
+        print(f" ✔️ Kept transaction: {tx}")  # type: ignore
 
         # Respect rate limit
         time.sleep(1)
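On the keep path the full tx dict is echoed as well, so every extracted field is visible in the console rather than just the company and recipient. A small sketch of that path; the surrounding loop and where all_extracted_deals is declared are assumed:

import time

all_extracted_deals: list[dict] = []

def keep(tx: dict) -> None:
    all_extracted_deals.append(tx)
    print(f" ✔️ Kept transaction: {tx}")
    time.sleep(1)  # crude per-item pause to respect the model API rate limit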