More console messages for Dorks crawler
ci/woodpecker/push/woodpecker: Pipeline was successful
commit ed0dc3c777
parent e6100dbfd9
@@ -127,15 +127,21 @@ async def fetch_site(url: str) -> str | None:
             for element in main_content(['script', 'style', 'aside']):  # type: ignore
                 element.decompose()
 
+            main_text = main_content.get_text(separator='\n', strip=True)
+            main_text = clean_string(main_text)
+
             print(f"SUCCESSFUL FETCH: {url}")
+            print(f"FETCH CONTENT: {main_text[:140]}...")
             # .get_text() with separator and strip for cleaner output
-            return main_content.get_text(separator='\n', strip=True)
+            return main_text
         else:
             # Fallback if no specific container is found (less reliable)
             print("WARNING: No main content container found. Falling back to body.")
             if soup.body:
                 body_text = soup.body.get_text(separator='\n', strip=True)
+                body_text = clean_string(body_text)
                 print(f"SUCCESSFUL FETCH: {url}")
+                print(f"FETCH CONTENT: {body_text[:140]}...")
                 return body_text
 
     except Exception as e:
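Note: the new FETCH CONTENT lines only echo a 140-character preview of the cleaned page text. A minimal sketch of the extract-clean-preview pattern, assuming clean_string collapses whitespace (its actual implementation lives elsewhere in the repo and is not shown in this diff):

import re
from bs4 import BeautifulSoup

def clean_string(text: str) -> str:
    # Hypothetical stand-in: collapse runs of whitespace into single spaces.
    return re.sub(r'\s+', ' ', text).strip()

def extract_preview(html: str, url: str) -> str | None:
    soup = BeautifulSoup(html, 'html.parser')
    main_content = soup.find('main') or soup.find('article')  # assumed container lookup
    if not main_content:
        return None
    for element in main_content(['script', 'style', 'aside']):
        element.decompose()  # drop non-content tags in place
    main_text = clean_string(main_content.get_text(separator='\n', strip=True))
    print(f"SUCCESSFUL FETCH: {url}")
    print(f"FETCH CONTENT: {main_text[:140]}...")  # slicing never raises, even on short text
    return main_text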
@@ -178,7 +184,7 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
         if content:
             pages.append({
                 "url": url,
-                "content": clean_string(content)
+                "content": content
             })
 
     print(f"\nSuccessfully fetched {len(pages)} webpages.")
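Because fetch_site now returns text that has already been through clean_string, the page dict in get_all_feed_contents stores the content as-is. A rough sketch of that call site under the new behaviour; the collect name and the asyncio.gather usage are illustrative, only fetch_site and the pages shape come from the diff:

import asyncio

async def collect(urls: list[str]) -> list[dict[str, str]]:
    pages: list[dict[str, str]] = []
    # fetch_site is the coroutine patched above; it returns cleaned text or None.
    results = await asyncio.gather(*(fetch_site(u) for u in urls))
    for url, content in zip(urls, results):
        if content:  # skip failed or empty fetches
            pages.append({"url": url, "content": content})  # no second clean_string pass
    print(f"\nSuccessfully fetched {len(pages)} webpages.")
    return pages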
@@ -239,7 +239,7 @@ async def main():
 
         # If model says 'none', skip by default (these are the irrelevant ones like US missile contracts)
         if relevance == "none":
-            print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", explanation[:200])
+            print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", tx)
            continue
 
         # basic required-field check (we want the API-required fields present)
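The skip message now prints the whole tx dict instead of a 200-character slice of the explanation, so the console shows the full record the model rejected. Illustrative shape of that gate; every field except relevance is an assumed example:

tx = {
    "relevance": "none",
    "company_division": "Example Corp",  # assumed example values
    "recipient": "US DoD",
    "explanation": "No Canadian entity is involved in this contract.",
}
relevance = tx.get("relevance", "none")
if relevance == "none":
    # One console line now carries both the skip reason and the offending record.
    print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", tx)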
@@ -250,7 +250,7 @@ async def main():
         # Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
         # Save the item
         all_extracted_deals.append(tx)
-        print(f" ✔️ Kept transaction: {tx.get('company_division')} → {tx.get('recipient')} ({relevance})")  # type: ignore
+        print(f" ✔️ Kept transaction: {tx}")  # type: ignore
 
         # Respect rate limit
         time.sleep(1)
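On the keep path the full tx dict is echoed as well, so every extracted field is visible in the console rather than just the company and recipient. A small sketch of that path; the surrounding loop and where all_extracted_deals is declared are assumed:

import time

all_extracted_deals: list[dict] = []

def keep(tx: dict) -> None:
    all_extracted_deals.append(tx)
    print(f" ✔️ Kept transaction: {tx}")
    time.sleep(1)  # crude per-item pause to respect the model API rate limit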