add dork crawler
ci/woodpecker/push/woodpecker Pipeline was successful
This commit is contained in:
parent f06d01613f
commit c11ec3f09c
@@ -0,0 +1,3 @@
.env
page_content.json
__pycache__/
@@ -0,0 +1,24 @@
def clean_string(input_string: str) -> str:
    """
    Cleans a multi-line string by trimming whitespace and removing empty lines.

    This function takes a string, splits it into lines, removes any leading or
    trailing whitespace from each line, discards any lines that become empty
    after trimming, and then joins the non-empty lines back together with
    a single newline character between them.

    Args:
        input_string: The string to be cleaned.

    Returns:
        A new string with whitespace-trimmed lines and no empty lines.
    """
    # Use a list comprehension for a concise solution:
    # 1. input_string.split('\n'): Splits the string into a list of lines.
    # 2. line.strip(): Removes leading/trailing whitespace from each line.
    # 3. if line.strip(): This condition filters out any strings that are empty
    #    after being stripped of whitespace.
    # 4. '\n'.join(...): Joins the elements of the resulting list into a
    #    single string, with each element separated by a newline.
    cleaned_lines = [line.strip() for line in input_string.split('\n') if line.strip()]
    return '\n'.join(cleaned_lines)
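For reference, a minimal usage sketch of clean_string; the sample input below is hypothetical and not part of this commit.

# Minimal usage sketch of clean_string; the input string is made up for illustration.
from clean_string import clean_string

messy = "  first line  \n\n   \n  second line \n"
print(clean_string(messy))
# Prints:
# first line
# second line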
@@ -0,0 +1,3 @@
{
    "Canadian Military Exports": "https://www.google.ca/alerts/feeds/02962857334213646081/4156920188674433267"
}
@@ -0,0 +1,34 @@
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup


async def fetch_site(url: str) -> str | None:
    """
    Fetches the text content of a URL using Playwright.

    Args:
        url: The URL of the website to fetch.

    Returns:
        A string containing the text content of the page, or None on error.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        try:
            # Wait for 'domcontentloaded' with a generous timeout; it is more
            # forgiving than 'networkidle' on pages that keep making requests.
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)

            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')

            # .get_text() is the standard method in modern BeautifulSoup
            return soup.get_text()

        except Exception as e:
            print(f"An error occurred: {e}")
            return None

        finally:
            await browser.close()
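A minimal sketch of calling fetch_site on its own, assuming Playwright's Chromium browser has been installed (for example with "playwright install chromium"); the URL is illustrative.

# Sketch: fetch a page's text directly; the URL is illustrative only.
import asyncio
from fetch_site import fetch_site

async def demo():
    text = await fetch_site("https://example.com")
    if text is not None:
        print(text[:200])  # first 200 characters of the extracted page text

asyncio.run(demo())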
@@ -0,0 +1,31 @@
import asyncio
import json
from clean_string import clean_string
from fetch_site import fetch_site
from get_feeds import get_feeds
from get_links_from_feed import get_links_from_feed


async def get_all_feed_contents():
    feeds = get_feeds()
    urls = []
    for keyword, feed in feeds:
        alerts = get_links_from_feed(feed)
        for alert in alerts:
            urls.append(alert.url)
    pages = []
    for url in urls:
        content = await fetch_site(url)
        if not content:
            continue
        pages.append({
            "url": url,
            "content": clean_string(content)
        })
    return pages


async def main():
    print(await get_all_feed_contents())


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,18 @@
import json
import os


def get_feeds() -> list[tuple[str, str]]:
    """Reads feed names and URLs from a local JSON file.

    This function opens 'feeds.json', which is expected to be in the
    same directory as this script. It parses the JSON object, which
    should contain string keys (feed names) and string values (URLs).

    Returns:
        list[tuple[str, str]]: A list of tuples, where each tuple
        contains a feed's name and its URL.
    """
    file_path = os.path.join(os.path.dirname(__file__), "./feeds.json")
    with open(file_path, "r") as f:
        data: dict[str, str] = json.load(f)
    return list(data.items())
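Given the feeds.json added above, a quick sketch of what get_feeds returns.

# Sketch: get_feeds() yields (name, url) pairs from feeds.json.
from get_feeds import get_feeds

for name, url in get_feeds():
    print(name, "->", url)
# With the committed feeds.json this prints:
# Canadian Military Exports -> https://www.google.ca/alerts/feeds/02962857334213646081/4156920188674433267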
@@ -0,0 +1,61 @@
from dataclasses import dataclass
from bs4 import BeautifulSoup
import feedparser
import urllib.parse


@dataclass
class Alert:
    """A simple data class to hold information about a single alert."""
    title: str
    url: str
    summary: str


def get_links_from_feed(rss_url: str) -> list[Alert]:
    """
    Parses a Google Alerts RSS feed URL and extracts the data for each alert.

    Args:
        rss_url: The URL of the Google Alerts RSS feed.

    Returns:
        A list of Alert objects. Returns an empty list if the feed
        cannot be parsed or is empty.
    """
    alerts: list[Alert] = []
    # Parse the RSS feed from the provided URL
    feed = feedparser.parse(rss_url)

    # Check if the feed was parsed successfully and has entries
    if feed.bozo:
        print(f"Error parsing feed: {feed.bozo_exception}")
        return alerts

    # Iterate over each entry in the feed
    for entry in feed.entries:
        # The title may contain HTML, so we parse it to get clean text.
        title_soup = BeautifulSoup(entry.title, "html.parser")  # type: ignore
        title = title_soup.get_text()

        # The summary often contains HTML, so we parse it to get clean text.
        summary_soup = BeautifulSoup(entry.summary, 'html.parser')  # type: ignore
        summary = summary_soup.get_text()

        # The link is a Google redirect URL; we extract the 'url' parameter.
        link = entry.link

        try:
            # Parse the URL to easily access its components
            parsed_url = urllib.parse.urlparse(link)  # type: ignore
            # Get the query parameters as a dictionary
            query_params = urllib.parse.parse_qs(parsed_url.query)
            # The actual destination URL is in the 'url' parameter
            actual_url = query_params.get('url', [None])[0]

            if actual_url:
                # Append an Alert object instead of a tuple
                alert_obj = Alert(title=title, url=actual_url, summary=summary)
                alerts.append(alert_obj)
        except Exception as e:
            print(f"Could not parse URL for entry '{title}': {e}")

    return alerts
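The redirect-unwrapping step above can be illustrated in isolation; the alert link below is a made-up example of the Google redirect format, not a real feed entry.

# Illustration of extracting the 'url' query parameter; the link is a made-up example.
import urllib.parse

link = "https://www.google.com/url?rct=j&url=https%3A%2F%2Fexample.com%2Farticle&ct=ga"
query_params = urllib.parse.parse_qs(urllib.parse.urlparse(link).query)
print(query_params.get("url", [None])[0])  # -> https://example.com/article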
@@ -0,0 +1,196 @@
import asyncio
from typing import Optional
import google.generativeai as genai
import json
import os
import time
from dotenv import load_dotenv
from pydantic import BaseModel, Field
import requests

from get_all_feed_contents import get_all_feed_contents
load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

INPUT_FILE = "./page_content.json"

MODEL_NAME = "gemini-2.0-flash-lite"

# TODO: refine
EXTRACTION_PROMPT = """
From the document text provided below, extract key details about any military or arms exports. More specifically, look for the following fields:

transaction_type - Type of transaction (e.g., "Purchase Order", "Subcontract")
company_division - Company or division name
recipient - Recipient of the transaction
amount - Transaction amount (defaults to 0)
description - Transaction description
address_1, address_2, city, province, region, postal_code - Address fields
source_date - Date in YYYY-MM-DD format
source_description - Source description
grant_type - Type of grant
commodity_class - Commodity classification
contract_number - Contract number
comments - Additional comments
is_primary - Boolean flag (defaults to false)

Do not hallucinate. If a field cannot be determined from the text, leave it empty.

---
DOCUMENT TEXT:
{text_content}
"""

SCHEMA = {
    "type": "object",
    "properties": {
        "transaction_type": {
            "type": "string",
            "description": "Type of transaction (e.g., 'Purchase Order', 'Subcontract')"
        },
        "company_division": {
            "type": "string",
            "description": "Company or division name"
        },
        "recipient": {
            "type": "string",
            "description": "Recipient of the transaction"
        },
        "amount": {
            "type": "number",
            "description": "Transaction amount"
        },
        "description": {
            "type": "string",
            "description": "Transaction description"
        },
        "address_1": {
            "type": "string",
            "description": "Address line 1"
        },
        "address_2": {
            "type": "string",
            "description": "Address line 2"
        },
        "city": {
            "type": "string",
            "description": "City"
        },
        "province": {
            "type": "string",
            "description": "Province/State"
        },
        "region": {
            "type": "string",
            "description": "Region"
        },
        "postal_code": {
            "type": "string",
            "description": "Postal code"
        },
        "source_date": {
            "type": "string",
            "format": "date-time",
            "description": "Date in YYYY-MM-DD format"
        },
        "source_description": {
            "type": "string",
            "description": "Source description"
        },
        "grant_type": {
            "type": "string",
            "description": "Type of grant"
        },
        "commodity_class": {
            "type": "string",
            "description": "Commodity classification"
        },
        "contract_number": {
            "type": "string",
            "description": "Contract number"
        },
        "comments": {
            "type": "string",
            "description": "Additional comments"
        },
        "is_primary": {
            "type": "boolean",
            "description": "Boolean flag indicating if it's primary"
        }
    }
}


def process_content_with_gemini(text_content):
    """
    Sends the text to the Gemini API with the extraction prompt and
    parses the JSON response.
    """
    model = genai.GenerativeModel(MODEL_NAME)  # type: ignore
    prompt = EXTRACTION_PROMPT.format(text_content=text_content)

    try:
        response = model.generate_content(
            prompt,
            generation_config={
                "response_schema": SCHEMA,
                "response_mime_type": 'application/json',
            }
        )
        return json.loads(response.text)
    except Exception as e:
        print(f"  ❌ An error occurred while calling Gemini or parsing its response: {e}")
        return {"error": str(e)}


async def main():
    """Main function to run the data extraction process."""
    if not GOOGLE_API_KEY:
        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
        return

    genai.configure(api_key=GOOGLE_API_KEY)  # type: ignore

    print("Retrieving all feed contents...")
    scraped_pages = await get_all_feed_contents()
    if not scraped_pages:
        print("❌ Error: No scraper results found.")
        return
    print("✅ Successfully retrieved all feed contents.")

    all_extracted_deals = []
    total_pages = len(scraped_pages)

    print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")

    for i, page in enumerate(scraped_pages):
        print(f"\nProcessing page {i+1}/{total_pages}: {page['url']}")

        # Avoid processing pages with very little text
        if len(page.get('content', '')) < 150:
            print("  ⏩ Skipping page due to insufficient content.")
            continue

        extracted_info = process_content_with_gemini(page['content'])

        # Check if the extraction was successful and contains actual data
        if extracted_info and "error" not in extracted_info:
            if ("transaction_type" in extracted_info) and ("company_division" in extracted_info) and ("recipient" in extracted_info):
                print("  ✔️ Found relevant info")
                all_extracted_deals.append(extracted_info)
            else:
                print("  ❌ Insufficient info")
                print(f"  Extracted info: {extracted_info}")

        # Add a small delay to respect API rate limits (1 second is safe)
        time.sleep(1)

    if all_extracted_deals:
        for transaction in all_extracted_deals:
            requests.post("https://ploughshares.nixc.us/api/transaction", json=transaction)
    else:
        print("\nNo relevant deals were extracted from any of the pages.")


if __name__ == "__main__":
    asyncio.run(main())
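For context, a hedged sketch of the kind of object the SCHEMA above is meant to yield; every field value below is invented for illustration. In the script, each such object is POSTed as JSON to https://ploughshares.nixc.us/api/transaction.

# Illustrative only: the shape of one extracted transaction.
# All values here are invented; real values come from Gemini's structured output.
example_transaction = {
    "transaction_type": "Purchase Order",
    "company_division": "Example Aerospace Ltd.",
    "recipient": "Example Ministry of Defence",
    "amount": 0,
    "description": "Illustrative line item only",
    "city": "Ottawa",
    "province": "ON",
    "source_date": "2025-01-01",
    "is_primary": False,
}
# The script submits each such dict with:
#   requests.post("https://ploughshares.nixc.us/api/transaction", json=transaction)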
@@ -0,0 +1,48 @@
annotated-types==0.7.0
anyio==4.9.0
beautifulsoup4==4.13.4
cachetools==5.5.2
certifi==2025.7.14
charset-normalizer==3.4.2
colorama==0.4.6
dnspython==2.7.0
email_validator==2.2.0
feedparser==6.0.11
google-ai-generativelanguage==0.6.15
google-api-core==2.25.1
google-api-python-client==2.177.0
google-auth==2.40.3
google-auth-httplib2==0.2.0
google-genai==1.28.0
google-generativeai==0.8.5
googleapis-common-protos==1.70.0
greenlet==3.2.3
grpcio==1.74.0
grpcio-status==1.71.2
h11==0.16.0
httpcore==1.0.9
httplib2==0.22.0
httpx==0.28.1
idna==3.10
playwright==1.54.0
proto-plus==1.26.1
protobuf==5.29.5
pyasn1==0.6.1
pyasn1_modules==0.4.2
pydantic==2.11.7
pydantic_core==2.33.2
pyee==13.0.0
pyparsing==3.2.3
python-dotenv==1.1.1
requests==2.32.4
rsa==4.9.1
sgmllib3k==1.0.0
sniffio==1.3.1
soupsieve==2.7
tenacity==8.5.0
tqdm==4.67.1
typing-inspection==0.4.1
typing_extensions==4.14.1
uritemplate==4.2.0
urllib3==2.5.0
websockets==15.0.1