add dork crawler
ci/woodpecker/push/woodpecker: Pipeline was successful

jChenvan 2025-08-07 17:16:02 -04:00
parent f06d01613f
commit c11ec3f09c
9 changed files with 418 additions and 0 deletions

docker/crawler_dorks/.gitignore

@@ -0,0 +1,3 @@
.env
page_content.json
__pycache__/


@@ -0,0 +1,24 @@
def clean_string(input_string: str) -> str:
    """
    Cleans a multi-line string by trimming whitespace and removing empty lines.

    This function takes a string, splits it into lines, removes any leading or
    trailing whitespace from each line, discards any lines that become empty
    after trimming, and then joins the non-empty lines back together with
    a single newline character between them.

    Args:
        input_string: The string to be cleaned.

    Returns:
        A new string with whitespace-trimmed lines and no empty lines.
    """
    # Use a list comprehension for a concise solution:
    # 1. input_string.split('\n'): Splits the string into a list of lines.
    # 2. line.strip(): Removes leading/trailing whitespace from each line.
    # 3. if line.strip(): This condition filters out any strings that are empty
    #    after being stripped of whitespace.
    # 4. '\n'.join(...): Joins the elements of the resulting list into a
    #    single string, with each element separated by a newline.
    cleaned_lines = [line.strip() for line in input_string.split('\n') if line.strip()]
    return '\n'.join(cleaned_lines)
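
To illustrate the behaviour the docstring describes, a quick check with an invented input string (not from the repository):

# Invented input: padded lines and blank lines mixed together.
raw = "  first line  \n\n   \n  second line\n"
print(clean_string(raw))
# Output:
# first line
# second line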


@@ -0,0 +1,3 @@
{
    "Canadian Military Exports": "https://www.google.ca/alerts/feeds/02962857334213646081/4156920188674433267"
}


@@ -0,0 +1,34 @@
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup


async def fetch_site(url: str) -> str | None:
    """
    Fetches the text content of a URL using Playwright.

    Args:
        url: The URL of the website to fetch.

    Returns:
        A string containing the text content of the page, or None on error.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        try:
            # Wait for 'domcontentloaded' rather than 'networkidle', with a
            # generous 60s timeout, so slow or chatty pages still load.
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')
            # .get_text() returns the page's visible text without markup.
            return soup.get_text()
        except Exception as e:
            print(f"An error occurred: {e}")
            return None
        finally:
            await browser.close()
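
A minimal way to try the helper on its own, sketched here with a placeholder URL (this snippet is illustrative and not part of the module above):

# Illustrative standalone check; the URL is a placeholder.
if __name__ == "__main__":
    text = asyncio.run(fetch_site("https://example.com"))
    print(text[:200] if text else "fetch failed")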


@@ -0,0 +1,31 @@
import asyncio
import json

from clean_string import clean_string
from fetch_site import fetch_site
from get_feeds import get_feeds
from get_links_from_feed import get_links_from_feed


async def get_all_feed_contents():
    feeds = get_feeds()
    urls = []
    for keyword, feed in feeds:
        alerts = get_links_from_feed(feed)
        for alert in alerts:
            urls.append(alert.url)
    pages = []
    for url in urls:
        content = await fetch_site(url)
        if not content:
            continue
        pages.append({
            "url": url,
            "content": clean_string(content)
        })
    return pages


async def main():
    print(await get_all_feed_contents())


if __name__ == "__main__":
    asyncio.run(main())
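
page_content.json appears in the .gitignore above and as INPUT_FILE in the extraction script below, but nothing in this commit writes it. A hedged sketch, building on the module above, of how the crawl output could be persisted there (save_pages is hypothetical):

# Hypothetical helper, not part of this commit: dump the crawl results to
# page_content.json, the file name referenced elsewhere in this directory.
async def save_pages(path: str = "page_content.json") -> None:
    pages = await get_all_feed_contents()
    with open(path, "w") as f:
        json.dump(pages, f, indent=2)

# asyncio.run(save_pages())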


@@ -0,0 +1,18 @@
import json
import os


def get_feeds() -> list[tuple[str, str]]:
    """Reads feed names and URLs from a local JSON file.

    This function opens 'feeds.json', which is expected to be in the
    same directory as this script. It parses the JSON object, which
    should contain string keys (feed names) and string values (URLs).

    Returns:
        list[tuple[str, str]]: A list of tuples, where each tuple
        contains a feed's name and its URL.
    """
    file_path = os.path.join(os.path.dirname(__file__), "./feeds.json")
    with open(file_path, "r") as f:
        data: dict[str, str] = json.load(f)
    return list(data.items())
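
With the feeds.json shown earlier, the function returns one (name, URL) pair per configured alert; a small illustration:

# Prints one line per configured feed, e.g.:
# Canadian Military Exports https://www.google.ca/alerts/feeds/02962857334213646081/4156920188674433267
for name, url in get_feeds():
    print(name, url)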


@@ -0,0 +1,61 @@
from dataclasses import dataclass
from bs4 import BeautifulSoup
import feedparser
import urllib.parse


@dataclass
class Alert:
    """A simple data class to hold information about a single alert."""
    title: str
    url: str
    summary: str


def get_links_from_feed(rss_url: str) -> list[Alert]:
    """
    Parses a Google Alerts RSS feed URL and extracts the data for each alert.

    Args:
        rss_url: The URL of the Google Alerts RSS feed.

    Returns:
        A list of Alert objects. Returns an empty list if the feed
        cannot be parsed or is empty.
    """
    alerts: list[Alert] = []

    # Parse the RSS feed from the provided URL.
    feed = feedparser.parse(rss_url)

    # Check whether the feed was parsed successfully.
    if feed.bozo:
        print(f"Error parsing feed: {feed.bozo_exception}")
        return alerts

    # Iterate over each entry in the feed.
    for entry in feed.entries:
        # The title and summary often contain HTML, so parse them to get clean text.
        title_soup = BeautifulSoup(entry.title, "html.parser")  # type: ignore
        title = title_soup.get_text()
        summary_soup = BeautifulSoup(entry.summary, 'html.parser')  # type: ignore
        summary = summary_soup.get_text()

        # The link is a Google redirect URL; we extract the 'url' parameter.
        link = entry.link
        try:
            # Parse the URL to easily access its components.
            parsed_url = urllib.parse.urlparse(link)  # type: ignore
            # Get the query parameters as a dictionary.
            query_params = urllib.parse.parse_qs(parsed_url.query)
            # The actual destination URL is in the 'url' parameter.
            actual_url = query_params.get('url', [None])[0]
            if actual_url:
                alert_obj = Alert(title=title, url=actual_url, summary=summary)
                alerts.append(alert_obj)
        except Exception as e:
            print(f"Could not parse URL for entry '{title}': {e}")

    return alerts
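
The redirect-unwrapping step can be sanity-checked in isolation; the link below is an invented example in the Google redirect format, not a real alert:

# Invented redirect link in the Google Alerts style, for illustration only.
sample_link = "https://www.google.com/url?rct=j&url=https%3A%2F%2Fexample.com%2Fstory&ct=ga"
parsed = urllib.parse.urlparse(sample_link)
params = urllib.parse.parse_qs(parsed.query)
print(params.get('url', [None])[0])  # -> https://example.com/story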


@@ -0,0 +1,196 @@
import asyncio
from typing import Optional
import google.generativeai as genai
import json
import os
import time
from dotenv import load_dotenv
from pydantic import BaseModel, Field
import requests

from get_all_feed_contents import get_all_feed_contents

load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
INPUT_FILE = "./page_content.json"
MODEL_NAME = "gemini-2.0-flash-lite"

# TODO: refine
EXTRACTION_PROMPT = """
From the document text provided below, extract key details about any military or arms exports. More specifically, look for the following fields:
transaction_type - Type of transaction (e.g., "Purchase Order", "Subcontract")
company_division - Company or division name
recipient - Recipient of the transaction
amount - Transaction amount (defaults to 0)
description - Transaction description
address_1, address_2, city, province, region, postal_code - Address fields
source_date - Date in YYYY-MM-DD format
source_description - Source description
grant_type - Type of grant
commodity_class - Commodity classification
contract_number - Contract number
comments - Additional comments
is_primary - Boolean flag (defaults to false)
Do not hallucinate. If a field cannot be determined from the text, leave it empty.
---
DOCUMENT TEXT:
{text_content}
"""

SCHEMA = {
    "type": "object",
    "properties": {
        "transaction_type": {
            "type": "string",
            "description": "Type of transaction (e.g., 'Purchase Order', 'Subcontract')"
        },
        "company_division": {
            "type": "string",
            "description": "Company or division name"
        },
        "recipient": {
            "type": "string",
            "description": "Recipient of the transaction"
        },
        "amount": {
            "type": "number",
            "description": "Transaction amount",
        },
        "description": {
            "type": "string",
            "description": "Transaction description"
        },
        "address_1": {
            "type": "string",
            "description": "Address line 1"
        },
        "address_2": {
            "type": "string",
            "description": "Address line 2"
        },
        "city": {
            "type": "string",
            "description": "City"
        },
        "province": {
            "type": "string",
            "description": "Province/State"
        },
        "region": {
            "type": "string",
            "description": "Region"
        },
        "postal_code": {
            "type": "string",
            "description": "Postal code"
        },
        "source_date": {
            "type": "string",
            "format": "date-time",
            "description": "Date in YYYY-MM-DD format"
        },
        "source_description": {
            "type": "string",
            "description": "Source description"
        },
        "grant_type": {
            "type": "string",
            "description": "Type of grant"
        },
        "commodity_class": {
            "type": "string",
            "description": "Commodity classification"
        },
        "contract_number": {
            "type": "string",
            "description": "Contract number"
        },
        "comments": {
            "type": "string",
            "description": "Additional comments"
        },
        "is_primary": {
            "type": "boolean",
            "description": "Boolean flag indicating if it's primary",
        }
    }
}


def process_content_with_gemini(text_content):
    """
    Sends the text to the Gemini API with the extraction prompt and
    parses the JSON response.
    """
    model = genai.GenerativeModel(MODEL_NAME)  # type: ignore
    prompt = EXTRACTION_PROMPT.format(text_content=text_content)
    try:
        response = model.generate_content(
            prompt,
            generation_config={
                "response_schema": SCHEMA,
                "response_mime_type": 'application/json',
            }
        )
        return json.loads(response.text)
    except Exception as e:
        print(f" ❌ An error occurred while calling Gemini or parsing its response: {e}")
        return {"error": str(e)}


async def main():
    """Main function to run the data extraction process."""
    if not GOOGLE_API_KEY:
        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
        return
    genai.configure(api_key=GOOGLE_API_KEY)  # type: ignore

    print("Retrieving all feed contents...")
    scraped_pages = await get_all_feed_contents()
    if not scraped_pages:
        print("❌ Error: No scraper results found.")
        return
    print("✅ Successfully retrieved all feed contents.")

    all_extracted_deals = []
    total_pages = len(scraped_pages)
    print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")

    for i, page in enumerate(scraped_pages):
        print(f"\nProcessing page {i+1}/{total_pages}: {page['url']}")

        # Avoid processing pages with very little text
        if len(page.get('content', '')) < 150:
            print(" ⏩ Skipping page due to insufficient content.")
            continue

        extracted_info = process_content_with_gemini(page['content'])

        # Check if the extraction was successful and contains actual data
        if extracted_info and "error" not in extracted_info:
            if ("transaction_type" in extracted_info) and ("company_division" in extracted_info) and ("recipient" in extracted_info):
                print(" ✔️ Found relevant info")
                all_extracted_deals.append(extracted_info)
            else:
                print(" ❌ insufficient info")
            print(f" Extracted info: {extracted_info}")

        # Add a small delay to respect API rate limits (1 second is safe)
        time.sleep(1)

    if all_extracted_deals:
        for transaction in all_extracted_deals:
            requests.post("https://ploughshares.nixc.us/api/transaction", json=transaction)
    else:
        print("\nNo relevant deals were extracted from any of the pages.")


if __name__ == "__main__":
    asyncio.run(main())
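
The POST body sent to https://ploughshares.nixc.us/api/transaction is simply the dict returned by Gemini, keyed by the SCHEMA fields above; an illustrative payload (every value invented) might look like:

# Every value below is invented; it only shows the expected field names.
example_transaction = {
    "transaction_type": "Purchase Order",
    "company_division": "Example Company, Example Division",
    "recipient": "Example Recipient",
    "amount": 100000,
    "description": "Example description of the exported goods",
    "city": "Ottawa",
    "province": "Ontario",
    "source_date": "2025-08-07",
    "is_primary": False,
}
# requests.post("https://ploughshares.nixc.us/api/transaction", json=example_transaction)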


@@ -0,0 +1,48 @@
annotated-types==0.7.0
anyio==4.9.0
beautifulsoup4==4.13.4
cachetools==5.5.2
certifi==2025.7.14
charset-normalizer==3.4.2
colorama==0.4.6
dnspython==2.7.0
email_validator==2.2.0
feedparser==6.0.11
google-ai-generativelanguage==0.6.15
google-api-core==2.25.1
google-api-python-client==2.177.0
google-auth==2.40.3
google-auth-httplib2==0.2.0
google-genai==1.28.0
google-generativeai==0.8.5
googleapis-common-protos==1.70.0
greenlet==3.2.3
grpcio==1.74.0
grpcio-status==1.71.2
h11==0.16.0
httpcore==1.0.9
httplib2==0.22.0
httpx==0.28.1
idna==3.10
playwright==1.54.0
proto-plus==1.26.1
protobuf==5.29.5
pyasn1==0.6.1
pyasn1_modules==0.4.2
pydantic==2.11.7
pydantic_core==2.33.2
pyee==13.0.0
pyparsing==3.2.3
python-dotenv==1.1.1
requests==2.32.4
rsa==4.9.1
sgmllib3k==1.0.0
sniffio==1.3.1
soupsieve==2.7
tenacity==8.5.0
tqdm==4.67.1
typing-inspection==0.4.1
typing_extensions==4.14.1
uritemplate==4.2.0
urllib3==2.5.0
websockets==15.0.1