More console messages for Dorks crawler
ci/woodpecker/push/woodpecker Pipeline was successful Details

This commit is contained in:
jChenvan 2025-09-03 14:59:44 -04:00
parent e6100dbfd9
commit ed0dc3c777
2 changed files with 10 additions and 4 deletions

View File

@@ -127,15 +127,21 @@ async def fetch_site(url: str) -> str | None:
for element in main_content(['script', 'style', 'aside']): # type: ignore
element.decompose()
main_text = main_content.get_text(separator='\n', strip=True)
main_text = clean_string(main_text)
print(f"SUCCESSFUL FETCH: {url}")
print(f"FETCH CONTENT: {main_text[:140]}...")
# .get_text() with separator and strip for cleaner output
return main_content.get_text(separator='\n', strip=True)
return main_text
else:
# Fallback if no specific container is found (less reliable)
print("WARNING: No main content container found. Falling back to body.")
if soup.body:
body_text = soup.body.get_text(separator='\n', strip=True)
body_text = clean_string(body_text)
print(f"SUCCESSFUL FETCH: {url}")
print(f"FETCH CONTENT: {body_text[:140]}...")
return body_text
except Exception as e:
@@ -178,7 +184,7 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
if content:
pages.append({
"url": url,
"content": clean_string(content)
"content": content
})
print(f"\nSuccessfully fetched {len(pages)} webpages.")

View File

@@ -239,7 +239,7 @@ async def main():
# If model says 'none', skip by default (these are the irrelevant ones like US missile contracts)
if relevance == "none":
print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", explanation[:200])
print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", tx)
continue
# basic required-field check (we want the API-required fields present)
@@ -250,7 +250,7 @@ async def main():
# Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
# Save the item
all_extracted_deals.append(tx)
print(f" ✔️ Kept transaction: {tx.get('company_division')}{tx.get('recipient')} ({relevance})") # type: ignore
print(f" ✔️ Kept transaction: {tx}") # type: ignore
# Respect rate limit
time.sleep(1)