add dork crawler
ci/woodpecker/push/woodpecker: Pipeline was successful

jChenvan 2025-08-07 17:16:02 -04:00
parent f06d01613f
commit c11ec3f09c
9 changed files with 418 additions and 0 deletions

docker/crawler_dorks/.gitignore

@@ -0,0 +1,3 @@
.env
page_content.json
__pycache__/


@@ -0,0 +1,24 @@
def clean_string(input_string: str) -> str:
    """
    Cleans a multi-line string by trimming whitespace and removing empty lines.

    This function takes a string, splits it into lines, removes any leading or
    trailing whitespace from each line, discards any lines that become empty
    after trimming, and then joins the non-empty lines back together with
    a single newline character between them.

    Args:
        input_string: The string to be cleaned.

    Returns:
        A new string with whitespace-trimmed lines and no empty lines.
    """
    # Use a list comprehension for a concise solution:
    # 1. input_string.split('\n'): Splits the string into a list of lines.
    # 2. line.strip(): Removes leading/trailing whitespace from each line.
    # 3. if line.strip(): This condition filters out any strings that are empty
    #    after being stripped of whitespace.
    # 4. '\n'.join(...): Joins the elements of the resulting list into a
    #    single string, with each element separated by a newline.
    cleaned_lines = [line.strip() for line in input_string.split('\n') if line.strip()]
    return '\n'.join(cleaned_lines)
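
To illustrate the behaviour the docstring describes, a quick check with an invented input string (not from the repository):

# Invented input: padded lines and blank lines mixed together.
raw = "  first line  \n\n   \n  second line\n"
print(clean_string(raw))
# Output:
# first line
# second line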


@@ -0,0 +1,3 @@
{
    "Canadian Military Exports": "https://www.google.ca/alerts/feeds/02962857334213646081/4156920188674433267"
}


@@ -0,0 +1,34 @@
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup


async def fetch_site(url: str) -> str | None:
    """
    Fetches the text content of a URL using Playwright.

    Args:
        url: The URL of the website to fetch.

    Returns:
        A string containing the text content of the page, or None on error.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        try:
            # Wait for 'domcontentloaded' rather than 'networkidle', with a
            # generous 60s timeout, so slow or chatty pages still load.
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')
            # .get_text() returns the page's visible text without markup.
            return soup.get_text()
        except Exception as e:
            print(f"An error occurred: {e}")
            return None
        finally:
            await browser.close()
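
A minimal way to try the helper on its own, sketched here with a placeholder URL (this snippet is illustrative and not part of the module above):

# Illustrative standalone check; the URL is a placeholder.
if __name__ == "__main__":
    text = asyncio.run(fetch_site("https://example.com"))
    print(text[:200] if text else "fetch failed")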


@@ -0,0 +1,31 @@
import asyncio
import json

from clean_string import clean_string
from fetch_site import fetch_site
from get_feeds import get_feeds
from get_links_from_feed import get_links_from_feed


async def get_all_feed_contents():
    feeds = get_feeds()
    urls = []
    for keyword, feed in feeds:
        alerts = get_links_from_feed(feed)
        for alert in alerts:
            urls.append(alert.url)
    pages = []
    for url in urls:
        content = await fetch_site(url)
        if not content:
            continue
        pages.append({
            "url": url,
            "content": clean_string(content)
        })
    return pages


async def main():
    print(await get_all_feed_contents())


if __name__ == "__main__":
    asyncio.run(main())
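
page_content.json appears in the .gitignore above and as INPUT_FILE in the extraction script below, but nothing in this commit writes it. A hedged sketch, building on the module above, of how the crawl output could be persisted there (save_pages is hypothetical):

# Hypothetical helper, not part of this commit: dump the crawl results to
# page_content.json, the file name referenced elsewhere in this directory.
async def save_pages(path: str = "page_content.json") -> None:
    pages = await get_all_feed_contents()
    with open(path, "w") as f:
        json.dump(pages, f, indent=2)

# asyncio.run(save_pages())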


@@ -0,0 +1,18 @@
import json
import os


def get_feeds() -> list[tuple[str, str]]:
    """Reads feed names and URLs from a local JSON file.

    This function opens 'feeds.json', which is expected to be in the
    same directory as this script. It parses the JSON object, which
    should contain string keys (feed names) and string values (URLs).

    Returns:
        list[tuple[str, str]]: A list of tuples, where each tuple
        contains a feed's name and its URL.
    """
    file_path = os.path.join(os.path.dirname(__file__), "./feeds.json")
    with open(file_path, "r") as f:
        data: dict[str, str] = json.load(f)
    return list(data.items())
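
With the feeds.json shown earlier, the function returns one (name, URL) pair per configured alert; a small illustration:

# Prints one line per configured feed, e.g.:
# Canadian Military Exports https://www.google.ca/alerts/feeds/02962857334213646081/4156920188674433267
for name, url in get_feeds():
    print(name, url)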


@@ -0,0 +1,61 @@
from dataclasses import dataclass
from bs4 import BeautifulSoup
import feedparser
import urllib.parse


@dataclass
class Alert:
    """A simple data class to hold information about a single alert."""
    title: str
    url: str
    summary: str


def get_links_from_feed(rss_url: str) -> list[Alert]:
    """
    Parses a Google Alerts RSS feed URL and extracts the data for each alert.

    Args:
        rss_url: The URL of the Google Alerts RSS feed.

    Returns:
        A list of Alert objects. Returns an empty list if the feed
        cannot be parsed or is empty.
    """
    alerts: list[Alert] = []

    # Parse the RSS feed from the provided URL.
    feed = feedparser.parse(rss_url)

    # Check whether the feed was parsed successfully.
    if feed.bozo:
        print(f"Error parsing feed: {feed.bozo_exception}")
        return alerts

    # Iterate over each entry in the feed.
    for entry in feed.entries:
        # The title and summary often contain HTML, so parse them to get clean text.
        title_soup = BeautifulSoup(entry.title, "html.parser")  # type: ignore
        title = title_soup.get_text()
        summary_soup = BeautifulSoup(entry.summary, 'html.parser')  # type: ignore
        summary = summary_soup.get_text()

        # The link is a Google redirect URL; we extract the 'url' parameter.
        link = entry.link
        try:
            # Parse the URL to easily access its components.
            parsed_url = urllib.parse.urlparse(link)  # type: ignore
            # Get the query parameters as a dictionary.
            query_params = urllib.parse.parse_qs(parsed_url.query)
            # The actual destination URL is in the 'url' parameter.
            actual_url = query_params.get('url', [None])[0]
            if actual_url:
                alert_obj = Alert(title=title, url=actual_url, summary=summary)
                alerts.append(alert_obj)
        except Exception as e:
            print(f"Could not parse URL for entry '{title}': {e}")

    return alerts
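
The redirect-unwrapping step can be sanity-checked in isolation; the link below is an invented example in the Google redirect format, not a real alert:

# Invented redirect link in the Google Alerts style, for illustration only.
sample_link = "https://www.google.com/url?rct=j&url=https%3A%2F%2Fexample.com%2Fstory&ct=ga"
parsed = urllib.parse.urlparse(sample_link)
params = urllib.parse.parse_qs(parsed.query)
print(params.get('url', [None])[0])  # -> https://example.com/story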


@@ -0,0 +1,196 @@
import asyncio
from typing import Optional
import google.generativeai as genai
import json
import os
import time
from dotenv import load_dotenv
from pydantic import BaseModel, Field
import requests

from get_all_feed_contents import get_all_feed_contents

load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
INPUT_FILE = "./page_content.json"
MODEL_NAME = "gemini-2.0-flash-lite"

# TODO: refine
EXTRACTION_PROMPT = """
From the document text provided below, extract key details about any military or arms exports. More specifically, look for the following fields:
transaction_type - Type of transaction (e.g., "Purchase Order", "Subcontract")
company_division - Company or division name
recipient - Recipient of the transaction
amount - Transaction amount (defaults to 0)
description - Transaction description
address_1, address_2, city, province, region, postal_code - Address fields
source_date - Date in YYYY-MM-DD format
source_description - Source description
grant_type - Type of grant
commodity_class - Commodity classification
contract_number - Contract number
comments - Additional comments
is_primary - Boolean flag (defaults to false)
Do not hallucinate. If a field cannot be determined from the text, leave it empty.
---
DOCUMENT TEXT:
{text_content}
"""

SCHEMA = {
    "type": "object",
    "properties": {
        "transaction_type": {
            "type": "string",
            "description": "Type of transaction (e.g., 'Purchase Order', 'Subcontract')"
        },
        "company_division": {
            "type": "string",
            "description": "Company or division name"
        },
        "recipient": {
            "type": "string",
            "description": "Recipient of the transaction"
        },
        "amount": {
            "type": "number",
            "description": "Transaction amount",
        },
        "description": {
            "type": "string",
            "description": "Transaction description"
        },
        "address_1": {
            "type": "string",
            "description": "Address line 1"
        },
        "address_2": {
            "type": "string",
            "description": "Address line 2"
        },
        "city": {
            "type": "string",
            "description": "City"
        },
        "province": {
            "type": "string",
            "description": "Province/State"
        },
        "region": {
            "type": "string",
            "description": "Region"
        },
        "postal_code": {
            "type": "string",
            "description": "Postal code"
        },
        "source_date": {
            "type": "string",
            "format": "date-time",
            "description": "Date in YYYY-MM-DD format"
        },
        "source_description": {
            "type": "string",
            "description": "Source description"
        },
        "grant_type": {
            "type": "string",
            "description": "Type of grant"
        },
        "commodity_class": {
            "type": "string",
            "description": "Commodity classification"
        },
        "contract_number": {
            "type": "string",
            "description": "Contract number"
        },
        "comments": {
            "type": "string",
            "description": "Additional comments"
        },
        "is_primary": {
            "type": "boolean",
            "description": "Boolean flag indicating if it's primary",
        }
    }
}


def process_content_with_gemini(text_content):
    """
    Sends the text to the Gemini API with the extraction prompt and
    parses the JSON response.
    """
    model = genai.GenerativeModel(MODEL_NAME)  # type: ignore
    prompt = EXTRACTION_PROMPT.format(text_content=text_content)
    try:
        response = model.generate_content(
            prompt,
            generation_config={
                "response_schema": SCHEMA,
                "response_mime_type": 'application/json',
            }
        )
        return json.loads(response.text)
    except Exception as e:
        print(f" ❌ An error occurred while calling Gemini or parsing its response: {e}")
        return {"error": str(e)}


async def main():
    """Main function to run the data extraction process."""
    if not GOOGLE_API_KEY:
        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
        return
    genai.configure(api_key=GOOGLE_API_KEY)  # type: ignore

    print("Retrieving all feed contents...")
    scraped_pages = await get_all_feed_contents()
    if not scraped_pages:
        print("❌ Error: No scraper results found.")
        return
    print("✅ Successfully retrieved all feed contents.")

    all_extracted_deals = []
    total_pages = len(scraped_pages)
    print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")

    for i, page in enumerate(scraped_pages):
        print(f"\nProcessing page {i+1}/{total_pages}: {page['url']}")

        # Avoid processing pages with very little text
        if len(page.get('content', '')) < 150:
            print(" ⏩ Skipping page due to insufficient content.")
            continue

        extracted_info = process_content_with_gemini(page['content'])

        # Check if the extraction was successful and contains actual data
        if extracted_info and "error" not in extracted_info:
            if ("transaction_type" in extracted_info) and ("company_division" in extracted_info) and ("recipient" in extracted_info):
                print(" ✔️ Found relevant info")
                all_extracted_deals.append(extracted_info)
            else:
                print(" ❌ insufficient info")
            print(f" Extracted info: {extracted_info}")

        # Add a small delay to respect API rate limits (1 second is safe)
        time.sleep(1)

    if all_extracted_deals:
        for transaction in all_extracted_deals:
            requests.post("https://ploughshares.nixc.us/api/transaction", json=transaction)
    else:
        print("\nNo relevant deals were extracted from any of the pages.")


if __name__ == "__main__":
    asyncio.run(main())
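
The POST body sent to https://ploughshares.nixc.us/api/transaction is simply the dict returned by Gemini, keyed by the SCHEMA fields above; an illustrative payload (every value invented) might look like:

# Every value below is invented; it only shows the expected field names.
example_transaction = {
    "transaction_type": "Purchase Order",
    "company_division": "Example Company, Example Division",
    "recipient": "Example Recipient",
    "amount": 100000,
    "description": "Example description of the exported goods",
    "city": "Ottawa",
    "province": "Ontario",
    "source_date": "2025-08-07",
    "is_primary": False,
}
# requests.post("https://ploughshares.nixc.us/api/transaction", json=example_transaction)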


@@ -0,0 +1,48 @@
annotated-types==0.7.0
anyio==4.9.0
beautifulsoup4==4.13.4
cachetools==5.5.2
certifi==2025.7.14
charset-normalizer==3.4.2
colorama==0.4.6
dnspython==2.7.0
email_validator==2.2.0
feedparser==6.0.11
google-ai-generativelanguage==0.6.15
google-api-core==2.25.1
google-api-python-client==2.177.0
google-auth==2.40.3
google-auth-httplib2==0.2.0
google-genai==1.28.0
google-generativeai==0.8.5
googleapis-common-protos==1.70.0
greenlet==3.2.3
grpcio==1.74.0
grpcio-status==1.71.2
h11==0.16.0
httpcore==1.0.9
httplib2==0.22.0
httpx==0.28.1
idna==3.10
playwright==1.54.0
proto-plus==1.26.1
protobuf==5.29.5
pyasn1==0.6.1
pyasn1_modules==0.4.2
pydantic==2.11.7
pydantic_core==2.33.2
pyee==13.0.0
pyparsing==3.2.3
python-dotenv==1.1.1
requests==2.32.4
rsa==4.9.1
sgmllib3k==1.0.0
sniffio==1.3.1
soupsieve==2.7
tenacity==8.5.0
tqdm==4.67.1
typing-inspection==0.4.1
typing_extensions==4.14.1
uritemplate==4.2.0
urllib3==2.5.0
websockets==15.0.1