diff --git a/docker/crawler_dorks/.gitignore b/docker/crawler_dorks/.gitignore
new file mode 100644
index 0000000..53d4418
--- /dev/null
+++ b/docker/crawler_dorks/.gitignore
@@ -0,0 +1,3 @@
+.env
+page_content.json
+__pycache__/
\ No newline at end of file
diff --git a/docker/crawler_dorks/clean_string.py b/docker/crawler_dorks/clean_string.py
new file mode 100644
index 0000000..f2554e5
--- /dev/null
+++ b/docker/crawler_dorks/clean_string.py
@@ -0,0 +1,24 @@
+def clean_string(input_string: str) -> str:
+    """
+    Cleans a multi-line string by trimming whitespace and removing empty lines.
+
+    This function takes a string, splits it into lines, removes any leading or
+    trailing whitespace from each line, discards any lines that become empty
+    after trimming, and then joins the non-empty lines back together with
+    a single newline character between them.
+
+    Args:
+        input_string: The string to be cleaned.
+
+    Returns:
+        A new string with whitespace-trimmed lines and no empty lines.
+    """
+    # Use a list comprehension for a concise solution:
+    # 1. input_string.split('\n'): Splits the string into a list of lines.
+    # 2. line.strip(): Removes leading/trailing whitespace from each line.
+    # 3. if line.strip(): This condition filters out any strings that are empty
+    #    after being stripped of whitespace.
+    # 4. '\n'.join(...): Joins the elements of the resulting list into a
+    #    single string, with each element separated by a newline.
+    cleaned_lines = [line.strip() for line in input_string.split('\n') if line.strip()]
+    return '\n'.join(cleaned_lines)
diff --git a/docker/crawler_dorks/feeds.json b/docker/crawler_dorks/feeds.json
new file mode 100644
index 0000000..e4a3446
--- /dev/null
+++ b/docker/crawler_dorks/feeds.json
@@ -0,0 +1,3 @@
+{
+    "Canadian Military Exports": "https://www.google.ca/alerts/feeds/02962857334213646081/4156920188674433267"
+}
\ No newline at end of file
diff --git a/docker/crawler_dorks/fetch_site.py b/docker/crawler_dorks/fetch_site.py
new file mode 100644
index 0000000..8528368
--- /dev/null
+++ b/docker/crawler_dorks/fetch_site.py
@@ -0,0 +1,34 @@
+import asyncio
+from playwright.async_api import async_playwright
+from bs4 import BeautifulSoup
+
+async def fetch_site(url: str) -> str | None:
+    """
+    Fetches the text content of a URL using Playwright.
+
+    Args:
+        url: The URL of the website to fetch.
+
+    Returns:
+        A string containing the text content of the page, or None on error.
+    """
+    async with async_playwright() as p:
+        browser = await p.chromium.launch()
+        page = await browser.new_page()
+
+        try:
+            # 'domcontentloaded' is more reliable than 'networkidle' here; the 60s timeout is a fallback
+            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
+
+            content = await page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            # .get_text() is the standard method in modern BeautifulSoup
+            return soup.get_text()
+
+        except Exception as e:
+            print(f"An error occurred: {e}")
+            return None
+
+        finally:
+            await browser.close()
\ No newline at end of file
diff --git a/docker/crawler_dorks/get_all_feed_contents.py b/docker/crawler_dorks/get_all_feed_contents.py
new file mode 100644
index 0000000..c1e2caf
--- /dev/null
+++ b/docker/crawler_dorks/get_all_feed_contents.py
@@ -0,0 +1,31 @@
+import asyncio
+from clean_string import clean_string
+from fetch_site import fetch_site
+from get_feeds import get_feeds
+from get_links_from_feed import get_links_from_feed
+
+
+async def get_all_feed_contents():
+    """Fetches every alert URL from all configured feeds and returns the cleaned text of each page."""
+    feeds = get_feeds()
+    urls = []
+    for keyword, feed in feeds:
+        alerts = get_links_from_feed(feed)
+        for alert in alerts:
+            urls.append(alert.url)
+    pages = []
+    for url in urls:
+        content = await fetch_site(url)
+        if not content:
+            continue
+        pages.append({
+            "url": url,
+            "content": clean_string(content)
+        })
+    return pages
+
+async def main():
+    print(await get_all_feed_contents())
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file
diff --git a/docker/crawler_dorks/get_feeds.py b/docker/crawler_dorks/get_feeds.py
new file mode 100644
index 0000000..6bd756e
--- /dev/null
+++ b/docker/crawler_dorks/get_feeds.py
@@ -0,0 +1,18 @@
+import json
+import os
+
+def get_feeds() -> list[tuple[str, str]]:
+    """Reads feed names and URLs from a local JSON file.
+
+    This function opens 'feeds.json', which is expected to be in the
+    same directory as this script. It parses the JSON object, which
+    should contain string keys (feed names) and string values (URLs).
+
+    Returns:
+        list[tuple[str, str]]: A list of tuples, where each tuple
+            contains a feed's name and its URL.
+    """
+    file_path = os.path.join(os.path.dirname(__file__), "feeds.json")
+    with open(file_path, "r") as f:
+        data: dict[str, str] = json.load(f)
+    return list(data.items())
\ No newline at end of file
diff --git a/docker/crawler_dorks/get_links_from_feed.py b/docker/crawler_dorks/get_links_from_feed.py
new file mode 100644
index 0000000..ce8652e
--- /dev/null
+++ b/docker/crawler_dorks/get_links_from_feed.py
@@ -0,0 +1,61 @@
+from dataclasses import dataclass
+from bs4 import BeautifulSoup
+import feedparser
+import urllib.parse
+
+@dataclass
+class Alert:
+    """A simple data class to hold information about a single alert."""
+    title: str
+    url: str
+    summary: str
+
+def get_links_from_feed(rss_url: str) -> list[Alert]:
+    """
+    Parses a Google Alerts RSS feed URL and extracts the data for each alert.
+
+    Args:
+        rss_url: The URL of the Google Alerts RSS feed.
+
+    Returns:
+        A list of Alert objects. Returns an empty list if the feed
+        cannot be parsed or is empty.
+    """
+    alerts: list[Alert] = []
+    # Parse the RSS feed from the provided URL
+    feed = feedparser.parse(rss_url)
+
+    # Check if the feed was parsed successfully and has entries
+    if feed.bozo:
+        print(f"Error parsing feed: {feed.bozo_exception}")
+        return alerts
+
+    # Iterate over each entry in the feed
+    for entry in feed.entries:
+        # The title can contain HTML markup, so parse it down to plain text
+        title_soup = BeautifulSoup(entry.title, "html.parser") #type: ignore
+        title = title_soup.get_text()
+
+        # The summary often contains HTML, so we parse it to get clean text.
+        summary_soup = BeautifulSoup(entry.summary, 'html.parser') #type: ignore
+        summary = summary_soup.get_text()
+
+        # The link is a Google redirect URL; we extract the 'url' parameter.
+        link = entry.link
+
+        try:
+            # Parse the URL to easily access its components
+            parsed_url = urllib.parse.urlparse(link) #type: ignore
+            # Get the query parameters as a dictionary
+            query_params = urllib.parse.parse_qs(parsed_url.query)
+            # The actual destination URL is in the 'url' parameter
+            actual_url = query_params.get('url', [None])[0]
+
+            if actual_url:
+                # Append an Alert object for this entry
+                alert_obj = Alert(title=title, url=actual_url, summary=summary)
+                alerts.append(alert_obj)
+        except Exception as e:
+            print(f"Could not parse URL for entry '{title}': {e}")
+
+    return alerts
\ No newline at end of file
diff --git a/docker/crawler_dorks/main.py b/docker/crawler_dorks/main.py
new file mode 100644
index 0000000..eb5afd5
--- /dev/null
+++ b/docker/crawler_dorks/main.py
@@ -0,0 +1,196 @@
+import asyncio
+from typing import Optional
+import google.generativeai as genai
+import json
+import os
+import time
+from dotenv import load_dotenv
+from pydantic import BaseModel, Field
+import requests
+
+from get_all_feed_contents import get_all_feed_contents
+load_dotenv()
+
+GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
+
+INPUT_FILE = "./page_content.json"
+
+MODEL_NAME = "gemini-2.0-flash-lite"
+
+# TODO: refine
+EXTRACTION_PROMPT = """
+From the document text provided below, extract key details about any military or arms exports. More specifically, look for the following fields:
+
+    transaction_type - Type of transaction (e.g., "Purchase Order", "Subcontract")
+    company_division - Company or division name
+    recipient - Recipient of the transaction
+    amount - Transaction amount (defaults to 0)
+    description - Transaction description
+    address_1, address_2, city, province, region, postal_code - Address fields
+    source_date - Date in YYYY-MM-DD format
+    source_description - Source description
+    grant_type - Type of grant
+    commodity_class - Commodity classification
+    contract_number - Contract number
+    comments - Additional comments
+    is_primary - Boolean flag (defaults to false)
+
+
+Do not hallucinate. If a field cannot be determined from the text, leave it empty.
+
+---
+DOCUMENT TEXT:
+{text_content}
+"""
+
+SCHEMA = {
+    "type": "object",
+    "properties": {
+        "transaction_type": {
+            "type": "string",
+            "description": "Type of transaction (e.g., 'Purchase Order', 'Subcontract')"
+        },
+        "company_division": {
+            "type": "string",
+            "description": "Company or division name"
+        },
+        "recipient": {
+            "type": "string",
+            "description": "Recipient of the transaction"
+        },
+        "amount": {
+            "type": "number",
+            "description": "Transaction amount",
+        },
+        "description": {
+            "type": "string",
+            "description": "Transaction description"
+        },
+        "address_1": {
+            "type": "string",
+            "description": "Address line 1"
+        },
+        "address_2": {
+            "type": "string",
+            "description": "Address line 2"
+        },
+        "city": {
+            "type": "string",
+            "description": "City"
+        },
+        "province": {
+            "type": "string",
+            "description": "Province/State"
+        },
+        "region": {
+            "type": "string",
+            "description": "Region"
+        },
+        "postal_code": {
+            "type": "string",
+            "description": "Postal code"
+        },
+        "source_date": {
+            "type": "string",
+            "format": "date-time",
+            "description": "Date in YYYY-MM-DD format"
+        },
+        "source_description": {
+            "type": "string",
+            "description": "Source description"
+        },
+        "grant_type": {
+            "type": "string",
+            "description": "Type of grant"
+        },
+        "commodity_class": {
+            "type": "string",
+            "description": "Commodity classification"
+        },
+        "contract_number": {
+            "type": "string",
+            "description": "Contract number"
+        },
+        "comments": {
+            "type": "string",
+            "description": "Additional comments"
+        },
+        "is_primary": {
+            "type": "boolean",
+            "description": "Boolean flag indicating if it's primary",
+        }
+    }
+}
+
+def process_content_with_gemini(text_content):
+    """
+    Sends the text to the Gemini API with the extraction prompt and
+    parses the JSON response.
+    """
+    model = genai.GenerativeModel(MODEL_NAME) # type: ignore
+    prompt = EXTRACTION_PROMPT.format(text_content=text_content)
+
+    try:
+        response = model.generate_content(
+            prompt,
+            generation_config={
+                "response_schema": SCHEMA,
+                "response_mime_type": 'application/json',
+            }
+        )
+        return json.loads(response.text)
+    except Exception as e:
+        print(f"  ❌ An error occurred while calling Gemini or parsing its response: {e}")
+        return {"error": str(e)}
+
+
+async def main():
+    """Main function to run the data extraction process."""
+    if not GOOGLE_API_KEY:
+        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
+        return
+
+    genai.configure(api_key=GOOGLE_API_KEY) # type: ignore
+
+    print("Retrieving all feed contents...")
+    scraped_pages = await get_all_feed_contents()
+    if not scraped_pages:
+        print("❌ Error: No scraper results found.")
+        return
+    print("✅ Successfully retrieved all feed contents.")
+
+    all_extracted_deals = []
+    total_pages = len(scraped_pages)
+
+    print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")
+
+    for i, page in enumerate(scraped_pages):
+        print(f"\nProcessing page {i+1}/{total_pages}: {page['url']}")
+
+        # Avoid processing pages with very little text
+        if len(page.get('content', '')) < 150:
+            print("  ⏩ Skipping page due to insufficient content.")
+            continue
+
+        extracted_info = process_content_with_gemini(page['content'])
+
+        # Check if the extraction was successful and contains actual data
+        if extracted_info and "error" not in extracted_info:
+            if ("transaction_type" in extracted_info) and ("company_division" in extracted_info) and ("recipient" in extracted_info):
+                print("  ✔️ Found relevant info")
+                all_extracted_deals.append(extracted_info)
+            else:
+                print("  ❌ Insufficient info")
+                print(f"  Extracted info: {extracted_info}")
+
+        # Add a small delay between Gemini calls to stay within API rate limits
+        time.sleep(1)
+
+    if all_extracted_deals:
+        for transaction in all_extracted_deals:
+            requests.post("https://ploughshares.nixc.us/api/transaction", json=transaction)
+    else:
+        print("\nNo relevant deals were extracted from any of the pages.")
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file
diff --git a/docker/crawler_dorks/requirements.txt b/docker/crawler_dorks/requirements.txt
new file mode 100644
index 0000000..2d2090e
--- /dev/null
+++ b/docker/crawler_dorks/requirements.txt
@@ -0,0 +1,48 @@
+annotated-types==0.7.0
+anyio==4.9.0
+beautifulsoup4==4.13.4
+cachetools==5.5.2
+certifi==2025.7.14
+charset-normalizer==3.4.2
+colorama==0.4.6
+dnspython==2.7.0
+email_validator==2.2.0
+feedparser==6.0.11
+google-ai-generativelanguage==0.6.15
+google-api-core==2.25.1
+google-api-python-client==2.177.0
+google-auth==2.40.3
+google-auth-httplib2==0.2.0
+google-genai==1.28.0
+google-generativeai==0.8.5
+googleapis-common-protos==1.70.0
+greenlet==3.2.3
+grpcio==1.74.0
+grpcio-status==1.71.2
+h11==0.16.0
+httpcore==1.0.9
+httplib2==0.22.0
+httpx==0.28.1
+idna==3.10
+playwright==1.54.0
+proto-plus==1.26.1
+protobuf==5.29.5
+pyasn1==0.6.1
+pyasn1_modules==0.4.2
+pydantic==2.11.7
+pydantic_core==2.33.2
+pyee==13.0.0
+pyparsing==3.2.3
+python-dotenv==1.1.1
+requests==2.32.4
+rsa==4.9.1
+sgmllib3k==1.0.0
+sniffio==1.3.1
+soupsieve==2.7
+tenacity==8.5.0
+tqdm==4.67.1
+typing-inspection==0.4.1
+typing_extensions==4.14.1
+uritemplate==4.2.0
+urllib3==2.5.0
+websockets==15.0.1
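
A quick way to review this patch locally is to exercise the fetch-and-extract path on a single page before running the full feed crawl. The sketch below is not part of the patch: it assumes the modules above are importable from the working directory, that GOOGLE_API_KEY is set (directly or via the .env loaded by main.py), and that the URL is only a placeholder.

# smoke_test.py - hypothetical reviewer helper, not included in this diff.
# Fetches one page, cleans it, and runs the Gemini extraction on the result.
import asyncio
import os

import google.generativeai as genai

from clean_string import clean_string
from fetch_site import fetch_site
from main import process_content_with_gemini

async def smoke_test(url: str) -> None:
    # Assumes GOOGLE_API_KEY is exported in the environment.
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])  # type: ignore
    content = await fetch_site(url)
    if not content:
        print("No content fetched; nothing to extract.")
        return
    print(process_content_with_gemini(clean_string(content)))

if __name__ == "__main__":
    # Placeholder URL - substitute any article surfaced by the Google Alerts feed.
    asyncio.run(smoke_test("https://example.com/some-article"))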