add dork crawler
ci/woodpecker/push/woodpecker Pipeline was successful
This commit is contained in:
parent f06d01613f
commit c11ec3f09c
@@ -0,0 +1,3 @@
.env
page_content.json
__pycache__/
@@ -0,0 +1,24 @@
def clean_string(input_string: str) -> str:
    """
    Cleans a multi-line string by trimming whitespace and removing empty lines.

    This function takes a string, splits it into lines, removes any leading or
    trailing whitespace from each line, discards any lines that become empty
    after trimming, and then joins the non-empty lines back together with
    a single newline character between them.

    Args:
        input_string: The string to be cleaned.

    Returns:
        A new string with whitespace-trimmed lines and no empty lines.
    """
    # Use a list comprehension for a concise solution:
    # 1. input_string.split('\n'): Splits the string into a list of lines.
    # 2. line.strip(): Removes leading/trailing whitespace from each line.
    # 3. if line.strip(): This condition filters out any strings that are empty
    #    after being stripped of whitespace.
    # 4. '\n'.join(...): Joins the elements of the resulting list into a
    #    single string, with each element separated by a newline.
    cleaned_lines = [line.strip() for line in input_string.split('\n') if line.strip()]
    return '\n'.join(cleaned_lines)
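For reference, a minimal usage sketch of clean_string; the sample input below is hypothetical and not part of this commit.

# Minimal usage sketch of clean_string; the input string is made up for illustration.
from clean_string import clean_string

messy = "  first line  \n\n   \n  second line \n"
print(clean_string(messy))
# Prints:
# first line
# second line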
@@ -0,0 +1,3 @@
{
    "Canadian Military Exports": "https://www.google.ca/alerts/feeds/02962857334213646081/4156920188674433267"
}
@@ -0,0 +1,34 @@
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup


async def fetch_site(url: str) -> str | None:
    """
    Fetches the text content of a URL using Playwright.

    Args:
        url: The URL of the website to fetch.

    Returns:
        A string containing the text content of the page, or None on error.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        try:
            # Wait for 'domcontentloaded' with a generous timeout; it is more
            # forgiving than 'networkidle' on pages that keep making requests.
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)

            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')

            # .get_text() is the standard method in modern BeautifulSoup
            return soup.get_text()

        except Exception as e:
            print(f"An error occurred: {e}")
            return None

        finally:
            await browser.close()
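A minimal sketch of calling fetch_site on its own, assuming Playwright's Chromium browser has been installed (for example with "playwright install chromium"); the URL is illustrative.

# Sketch: fetch a page's text directly; the URL is illustrative only.
import asyncio
from fetch_site import fetch_site

async def demo():
    text = await fetch_site("https://example.com")
    if text is not None:
        print(text[:200])  # first 200 characters of the extracted page text

asyncio.run(demo())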
@@ -0,0 +1,31 @@
import asyncio
import json
from clean_string import clean_string
from fetch_site import fetch_site
from get_feeds import get_feeds
from get_links_from_feed import get_links_from_feed


async def get_all_feed_contents():
    feeds = get_feeds()
    urls = []
    for keyword, feed in feeds:
        alerts = get_links_from_feed(feed)
        for alert in alerts:
            urls.append(alert.url)
    pages = []
    for url in urls:
        content = await fetch_site(url)
        if not content:
            continue
        pages.append({
            "url": url,
            "content": clean_string(content)
        })
    return pages


async def main():
    print(await get_all_feed_contents())


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,18 @@
import json
import os


def get_feeds() -> list[tuple[str, str]]:
    """Reads feed names and URLs from a local JSON file.

    This function opens 'feeds.json', which is expected to be in the
    same directory as this script. It parses the JSON object, which
    should contain string keys (feed names) and string values (URLs).

    Returns:
        list[tuple[str, str]]: A list of tuples, where each tuple
        contains a feed's name and its URL.
    """
    file_path = os.path.join(os.path.dirname(__file__), "./feeds.json")
    with open(file_path, "r") as f:
        data: dict[str, str] = json.load(f)
    return list(data.items())
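Given the feeds.json added above, a quick sketch of what get_feeds returns.

# Sketch: get_feeds() yields (name, url) pairs from feeds.json.
from get_feeds import get_feeds

for name, url in get_feeds():
    print(name, "->", url)
# With the committed feeds.json this prints:
# Canadian Military Exports -> https://www.google.ca/alerts/feeds/02962857334213646081/4156920188674433267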
@@ -0,0 +1,61 @@
from dataclasses import dataclass
from bs4 import BeautifulSoup
import feedparser
import urllib.parse


@dataclass
class Alert:
    """A simple data class to hold information about a single alert."""
    title: str
    url: str
    summary: str


def get_links_from_feed(rss_url: str) -> list[Alert]:
    """
    Parses a Google Alerts RSS feed URL and extracts the data for each alert.

    Args:
        rss_url: The URL of the Google Alerts RSS feed.

    Returns:
        A list of Alert objects. Returns an empty list if the feed
        cannot be parsed or is empty.
    """
    alerts: list[Alert] = []
    # Parse the RSS feed from the provided URL
    feed = feedparser.parse(rss_url)

    # Check if the feed was parsed successfully and has entries
    if feed.bozo:
        print(f"Error parsing feed: {feed.bozo_exception}")
        return alerts

    # Iterate over each entry in the feed
    for entry in feed.entries:
        # The title may contain HTML, so we parse it to get clean text.
        title_soup = BeautifulSoup(entry.title, "html.parser")  # type: ignore
        title = title_soup.get_text()

        # The summary often contains HTML, so we parse it to get clean text.
        summary_soup = BeautifulSoup(entry.summary, 'html.parser')  # type: ignore
        summary = summary_soup.get_text()

        # The link is a Google redirect URL; we extract the 'url' parameter.
        link = entry.link

        try:
            # Parse the URL to easily access its components
            parsed_url = urllib.parse.urlparse(link)  # type: ignore
            # Get the query parameters as a dictionary
            query_params = urllib.parse.parse_qs(parsed_url.query)
            # The actual destination URL is in the 'url' parameter
            actual_url = query_params.get('url', [None])[0]

            if actual_url:
                # Append an Alert object instead of a tuple
                alert_obj = Alert(title=title, url=actual_url, summary=summary)
                alerts.append(alert_obj)
        except Exception as e:
            print(f"Could not parse URL for entry '{title}': {e}")

    return alerts
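The redirect-unwrapping step above can be illustrated in isolation; the alert link below is a made-up example of the Google redirect format, not a real feed entry.

# Illustration of extracting the 'url' query parameter; the link is a made-up example.
import urllib.parse

link = "https://www.google.com/url?rct=j&url=https%3A%2F%2Fexample.com%2Farticle&ct=ga"
query_params = urllib.parse.parse_qs(urllib.parse.urlparse(link).query)
print(query_params.get("url", [None])[0])  # -> https://example.com/article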
@@ -0,0 +1,196 @@
import asyncio
from typing import Optional
import google.generativeai as genai
import json
import os
import time
from dotenv import load_dotenv
from pydantic import BaseModel, Field
import requests

from get_all_feed_contents import get_all_feed_contents
load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

INPUT_FILE = "./page_content.json"

MODEL_NAME = "gemini-2.0-flash-lite"

# TODO: refine
EXTRACTION_PROMPT = """
From the document text provided below, extract key details about any military or arms exports. More specifically, look for the following fields:

transaction_type - Type of transaction (e.g., "Purchase Order", "Subcontract")
company_division - Company or division name
recipient - Recipient of the transaction
amount - Transaction amount (defaults to 0)
description - Transaction description
address_1, address_2, city, province, region, postal_code - Address fields
source_date - Date in YYYY-MM-DD format
source_description - Source description
grant_type - Type of grant
commodity_class - Commodity classification
contract_number - Contract number
comments - Additional comments
is_primary - Boolean flag (defaults to false)

Do not hallucinate. If a field cannot be determined from the text, leave it empty.

---
DOCUMENT TEXT:
{text_content}
"""

SCHEMA = {
    "type": "object",
    "properties": {
        "transaction_type": {
            "type": "string",
            "description": "Type of transaction (e.g., 'Purchase Order', 'Subcontract')"
        },
        "company_division": {
            "type": "string",
            "description": "Company or division name"
        },
        "recipient": {
            "type": "string",
            "description": "Recipient of the transaction"
        },
        "amount": {
            "type": "number",
            "description": "Transaction amount"
        },
        "description": {
            "type": "string",
            "description": "Transaction description"
        },
        "address_1": {
            "type": "string",
            "description": "Address line 1"
        },
        "address_2": {
            "type": "string",
            "description": "Address line 2"
        },
        "city": {
            "type": "string",
            "description": "City"
        },
        "province": {
            "type": "string",
            "description": "Province/State"
        },
        "region": {
            "type": "string",
            "description": "Region"
        },
        "postal_code": {
            "type": "string",
            "description": "Postal code"
        },
        "source_date": {
            "type": "string",
            "format": "date-time",
            "description": "Date in YYYY-MM-DD format"
        },
        "source_description": {
            "type": "string",
            "description": "Source description"
        },
        "grant_type": {
            "type": "string",
            "description": "Type of grant"
        },
        "commodity_class": {
            "type": "string",
            "description": "Commodity classification"
        },
        "contract_number": {
            "type": "string",
            "description": "Contract number"
        },
        "comments": {
            "type": "string",
            "description": "Additional comments"
        },
        "is_primary": {
            "type": "boolean",
            "description": "Boolean flag indicating if it's primary"
        }
    }
}


def process_content_with_gemini(text_content):
    """
    Sends the text to the Gemini API with the extraction prompt and
    parses the JSON response.
    """
    model = genai.GenerativeModel(MODEL_NAME)  # type: ignore
    prompt = EXTRACTION_PROMPT.format(text_content=text_content)

    try:
        response = model.generate_content(
            prompt,
            generation_config={
                "response_schema": SCHEMA,
                "response_mime_type": 'application/json',
            }
        )
        return json.loads(response.text)
    except Exception as e:
        print(f"  ❌ An error occurred while calling Gemini or parsing its response: {e}")
        return {"error": str(e)}


async def main():
    """Main function to run the data extraction process."""
    if not GOOGLE_API_KEY:
        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
        return

    genai.configure(api_key=GOOGLE_API_KEY)  # type: ignore

    print("Retrieving all feed contents...")
    scraped_pages = await get_all_feed_contents()
    if not scraped_pages:
        print("❌ Error: No scraper results found.")
        return
    print("✅ Successfully retrieved all feed contents.")

    all_extracted_deals = []
    total_pages = len(scraped_pages)

    print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")

    for i, page in enumerate(scraped_pages):
        print(f"\nProcessing page {i+1}/{total_pages}: {page['url']}")

        # Avoid processing pages with very little text
        if len(page.get('content', '')) < 150:
            print("  ⏩ Skipping page due to insufficient content.")
            continue

        extracted_info = process_content_with_gemini(page['content'])

        # Check if the extraction was successful and contains actual data
        if extracted_info and "error" not in extracted_info:
            if ("transaction_type" in extracted_info) and ("company_division" in extracted_info) and ("recipient" in extracted_info):
                print("  ✔️ Found relevant info")
                all_extracted_deals.append(extracted_info)
            else:
                print("  ❌ Insufficient info")
                print(f"  Extracted info: {extracted_info}")

        # Add a small delay to respect API rate limits (1 second is safe)
        time.sleep(1)

    if all_extracted_deals:
        for transaction in all_extracted_deals:
            requests.post("https://ploughshares.nixc.us/api/transaction", json=transaction)
    else:
        print("\nNo relevant deals were extracted from any of the pages.")


if __name__ == "__main__":
    asyncio.run(main())
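For context, a hedged sketch of the kind of object the SCHEMA above is meant to yield; every field value below is invented for illustration. In the script, each such object is POSTed as JSON to https://ploughshares.nixc.us/api/transaction.

# Illustrative only: the shape of one extracted transaction.
# All values here are invented; real values come from Gemini's structured output.
example_transaction = {
    "transaction_type": "Purchase Order",
    "company_division": "Example Aerospace Ltd.",
    "recipient": "Example Ministry of Defence",
    "amount": 0,
    "description": "Illustrative line item only",
    "city": "Ottawa",
    "province": "ON",
    "source_date": "2025-01-01",
    "is_primary": False,
}
# The script submits each such dict with:
#   requests.post("https://ploughshares.nixc.us/api/transaction", json=transaction)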
@@ -0,0 +1,48 @@
annotated-types==0.7.0
anyio==4.9.0
beautifulsoup4==4.13.4
cachetools==5.5.2
certifi==2025.7.14
charset-normalizer==3.4.2
colorama==0.4.6
dnspython==2.7.0
email_validator==2.2.0
feedparser==6.0.11
google-ai-generativelanguage==0.6.15
google-api-core==2.25.1
google-api-python-client==2.177.0
google-auth==2.40.3
google-auth-httplib2==0.2.0
google-genai==1.28.0
google-generativeai==0.8.5
googleapis-common-protos==1.70.0
greenlet==3.2.3
grpcio==1.74.0
grpcio-status==1.71.2
h11==0.16.0
httpcore==1.0.9
httplib2==0.22.0
httpx==0.28.1
idna==3.10
playwright==1.54.0
proto-plus==1.26.1
protobuf==5.29.5
pyasn1==0.6.1
pyasn1_modules==0.4.2
pydantic==2.11.7
pydantic_core==2.33.2
pyee==13.0.0
pyparsing==3.2.3
python-dotenv==1.1.1
requests==2.32.4
rsa==4.9.1
sgmllib3k==1.0.0
sniffio==1.3.1
soupsieve==2.7
tenacity==8.5.0
tqdm==4.67.1
typing-inspection==0.4.1
typing_extensions==4.14.1
uritemplate==4.2.0
urllib3==2.5.0
websockets==15.0.1