More console messages for Dorks crawler
ci/woodpecker/push/woodpecker Pipeline was successful
Details
ci/woodpecker/push/woodpecker Pipeline was successful
Details
This commit is contained in:
parent
e6100dbfd9
commit
ed0dc3c777
|
@ -127,15 +127,21 @@ async def fetch_site(url: str) -> str | None:
|
|||
for element in main_content(['script', 'style', 'aside']): # type: ignore
|
||||
element.decompose()
|
||||
|
||||
main_text = main_content.get_text(separator='\n', strip=True)
|
||||
main_text = clean_string(main_text)
|
||||
|
||||
print(f"SUCCESSFUL FETCH: {url}")
|
||||
print(f"FETCH CONTENT: {main_text[:140]}...")
|
||||
# .get_text() with separator and strip for cleaner output
|
||||
return main_content.get_text(separator='\n', strip=True)
|
||||
return main_text
|
||||
else:
|
||||
# Fallback if no specific container is found (less reliable)
|
||||
print("WARNING: No main content container found. Falling back to body.")
|
||||
if soup.body:
|
||||
body_text = soup.body.get_text(separator='\n', strip=True)
|
||||
body_text = clean_string(body_text)
|
||||
print(f"SUCCESSFUL FETCH: {url}")
|
||||
print(f"FETCH CONTENT: {body_text[:140]}...")
|
||||
return body_text
|
||||
|
||||
except Exception as e:
|
||||
|
@ -178,7 +184,7 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
|
|||
if content:
|
||||
pages.append({
|
||||
"url": url,
|
||||
"content": clean_string(content)
|
||||
"content": content
|
||||
})
|
||||
|
||||
print(f"\nSuccessfully fetched {len(pages)} webpages.")
|
||||
|
|
|
@ -239,7 +239,7 @@ async def main():
|
|||
|
||||
# If model says 'none', skip by default (these are the irrelevant ones like US missile contracts)
|
||||
if relevance == "none":
|
||||
print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", explanation[:200])
|
||||
print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", tx)
|
||||
continue
|
||||
|
||||
# basic required-field check (we want the API-required fields present)
|
||||
|
@ -250,7 +250,7 @@ async def main():
|
|||
# Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
|
||||
# Save the item
|
||||
all_extracted_deals.append(tx)
|
||||
print(f" ✔️ Kept transaction: {tx.get('company_division')} → {tx.get('recipient')} ({relevance})") # type: ignore
|
||||
print(f" ✔️ Kept transaction: {tx}") # type: ignore
|
||||
|
||||
# Respect rate limit
|
||||
time.sleep(1)
|
||||
|
|
Loading…
Reference in New Issue