add dork crawler
This commit is contained in:
parent f06d01613f · commit c11ec3f09c
@@ -0,0 +1,3 @@
.env
page_content.json
__pycache__/
@@ -0,0 +1,24 @@
def clean_string(input_string: str) -> str:
    """
    Cleans a multi-line string by trimming whitespace and removing empty lines.

    This function takes a string, splits it into lines, removes any leading or
    trailing whitespace from each line, discards any lines that become empty
    after trimming, and then joins the non-empty lines back together with
    a single newline character between them.

    Args:
        input_string: The string to be cleaned.

    Returns:
        A new string with whitespace-trimmed lines and no empty lines.
    """
    # Use a list comprehension for a concise solution:
    # 1. input_string.split('\n'): Splits the string into a list of lines.
    # 2. line.strip(): Removes leading/trailing whitespace from each line.
    # 3. if line.strip(): This condition filters out any strings that are empty
    #    after being stripped of whitespace.
    # 4. '\n'.join(...): Joins the elements of the resulting list into a
    #    single string, with each element separated by a newline.
    cleaned_lines = [line.strip() for line in input_string.split('\n') if line.strip()]
    return '\n'.join(cleaned_lines)
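A quick way to sanity-check clean_string.py on its own; the sample string below is made up purely for illustration:

from clean_string import clean_string

raw = "   first line   \n\n      \n  second line \n"
print(clean_string(raw))
# first line
# second line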
@@ -0,0 +1,3 @@
{
    "Canadian Military Exports": "https://www.google.ca/alerts/feeds/02962857334213646081/4156920188674433267"
}
@@ -0,0 +1,34 @@
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

async def fetch_site(url: str) -> str | None:
    """
    Fetches the text content of a URL using Playwright.

    Args:
        url: The URL of the website to fetch.

    Returns:
        A string containing the text content of the page, or None on error.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        try:
            # Wait only for 'domcontentloaded' with a generous timeout; 'networkidle' can stall on busy pages
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)

            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')

            # .get_text() is the standard method in modern BeautifulSoup
            return soup.get_text()

        except Exception as e:
            print(f"An error occurred: {e}")
            return None

        finally:
            await browser.close()
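fetch_site.py can be exercised in isolation along these lines; the URL is a placeholder, and Playwright's Chromium is assumed to be installed (playwright install chromium):

import asyncio
from fetch_site import fetch_site

async def demo():
    # Placeholder URL; any reachable page works
    text = await fetch_site("https://example.com")
    print(text[:200] if text else "fetch failed")

asyncio.run(demo())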
@@ -0,0 +1,31 @@
import asyncio
import json
from clean_string import clean_string
from fetch_site import fetch_site
from get_feeds import get_feeds
from get_links_from_feed import get_links_from_feed


async def get_all_feed_contents():
    feeds = get_feeds()
    urls = []
    for keyword, feed in feeds:
        alerts = get_links_from_feed(feed)
        for alert in alerts:
            urls.append(alert.url)
    pages = []
    for url in urls:
        content = await fetch_site(url)
        if not content:
            continue
        pages.append({
            "url": url,
            "content": clean_string(content)
        })
    return pages

async def main():
    print(await get_all_feed_contents())

if __name__ == "__main__":
    asyncio.run(main())
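page_content.json (ignored above and referenced by INPUT_FILE in the extraction script) could be produced with a small helper along these lines; dump_pages is a hypothetical name, not part of the commit:

import asyncio
import json
from get_all_feed_contents import get_all_feed_contents

async def dump_pages(path: str = "page_content.json") -> None:
    # Scrape every alert page and persist the cleaned text for later runs
    pages = await get_all_feed_contents()
    with open(path, "w") as f:
        json.dump(pages, f, indent=2)

asyncio.run(dump_pages())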
@@ -0,0 +1,18 @@
import json
import os

def get_feeds() -> list[tuple[str, str]]:
    """Reads feed names and URLs from a local JSON file.

    This function opens 'feeds.json', which is expected to be in the
    same directory as this script. It parses the JSON object, which
    should contain string keys (feed names) and string values (URLs).

    Returns:
        list[tuple[str, str]]: A list of tuples, where each tuple
            contains a feed's name and its URL.
    """
    file_path = os.path.join(os.path.dirname(__file__), "./feeds.json")
    with open(file_path, "r") as f:
        data: dict[str, str] = json.load(f)
    return list(data.items())
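Minimal usage of get_feeds(), assuming feeds.json has the shape committed above:

from get_feeds import get_feeds

for name, url in get_feeds():
    print(f"{name}: {url}")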
@@ -0,0 +1,61 @@
from dataclasses import dataclass
from bs4 import BeautifulSoup
import feedparser
import urllib.parse

@dataclass
class Alert:
    """A simple data class to hold information about a single alert."""
    title: str
    url: str
    summary: str

def get_links_from_feed(rss_url: str) -> list[Alert]:
    """
    Parses a Google Alerts RSS feed URL and extracts the data for each alert.

    Args:
        rss_url: The URL of the Google Alerts RSS feed.

    Returns:
        A list of Alert objects. Returns an empty list if the feed
        cannot be parsed or is empty.
    """
    alerts: list[Alert] = []
    # Parse the RSS feed from the provided URL
    feed = feedparser.parse(rss_url)

    # Check if the feed was parsed successfully and has entries
    if feed.bozo:
        print(f"Error parsing feed: {feed.bozo_exception}")
        return alerts

    # Iterate over each entry in the feed
    for entry in feed.entries:
        # The title can contain HTML markup, so strip it down to plain text.
        title_soup = BeautifulSoup(entry.title, "html.parser")  # type: ignore
        title = title_soup.get_text()

        # The summary often contains HTML, so we parse it to get clean text.
        summary_soup = BeautifulSoup(entry.summary, 'html.parser')  # type: ignore
        summary = summary_soup.get_text()

        # The link is a Google redirect URL; we extract the 'url' parameter.
        link = entry.link

        try:
            # Parse the URL to easily access its components
            parsed_url = urllib.parse.urlparse(link)  # type: ignore
            # Get the query parameters as a dictionary
            query_params = urllib.parse.parse_qs(parsed_url.query)
            # The actual destination URL is in the 'url' parameter
            actual_url = query_params.get('url', [None])[0]

            if actual_url:
                # Append an Alert object instead of a tuple
                alert_obj = Alert(title=title, url=actual_url, summary=summary)
                alerts.append(alert_obj)
        except Exception as e:
            print(f"Could not parse URL for entry '{title}': {e}")

    return alerts
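Combined with get_feeds(), the parser can be smoke-tested roughly as follows (output depends on whatever the Google Alerts feed currently contains):

from get_feeds import get_feeds
from get_links_from_feed import get_links_from_feed

for name, feed_url in get_feeds():
    for alert in get_links_from_feed(feed_url):
        print(f"{alert.title} -> {alert.url}")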
@@ -0,0 +1,196 @@
import asyncio
from typing import Optional
import google.generativeai as genai
import json
import os
import time
from dotenv import load_dotenv
from pydantic import BaseModel, Field
import requests

from get_all_feed_contents import get_all_feed_contents
load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

INPUT_FILE = "./page_content.json"

MODEL_NAME = "gemini-2.0-flash-lite"

# TODO: refine
EXTRACTION_PROMPT = """
From the document text provided below, extract key details about any military or arms exports. More specifically, look for the following fields:

transaction_type - Type of transaction (e.g., "Purchase Order", "Subcontract")
company_division - Company or division name
recipient - Recipient of the transaction
amount - Transaction amount (defaults to 0)
description - Transaction description
address_1, address_2, city, province, region, postal_code - Address fields
source_date - Date in YYYY-MM-DD format
source_description - Source description
grant_type - Type of grant
commodity_class - Commodity classification
contract_number - Contract number
comments - Additional comments
is_primary - Boolean flag (defaults to false)


Do not hallucinate. If a field cannot be determined from the text, leave it empty.

---
DOCUMENT TEXT:
{text_content}
"""

SCHEMA = {
    "type": "object",
    "properties": {
        "transaction_type": {
            "type": "string",
            "description": "Type of transaction (e.g., 'Purchase Order', 'Subcontract')"
        },
        "company_division": {
            "type": "string",
            "description": "Company or division name"
        },
        "recipient": {
            "type": "string",
            "description": "Recipient of the transaction"
        },
        "amount": {
            "type": "number",
            "description": "Transaction amount",
        },
        "description": {
            "type": "string",
            "description": "Transaction description"
        },
        "address_1": {
            "type": "string",
            "description": "Address line 1"
        },
        "address_2": {
            "type": "string",
            "description": "Address line 2"
        },
        "city": {
            "type": "string",
            "description": "City"
        },
        "province": {
            "type": "string",
            "description": "Province/State"
        },
        "region": {
            "type": "string",
            "description": "Region"
        },
        "postal_code": {
            "type": "string",
            "description": "Postal code"
        },
        "source_date": {
            "type": "string",
            "format": "date-time",
            "description": "Date in YYYY-MM-DD format"
        },
        "source_description": {
            "type": "string",
            "description": "Source description"
        },
        "grant_type": {
            "type": "string",
            "description": "Type of grant"
        },
        "commodity_class": {
            "type": "string",
            "description": "Commodity classification"
        },
        "contract_number": {
            "type": "string",
            "description": "Contract number"
        },
        "comments": {
            "type": "string",
            "description": "Additional comments"
        },
        "is_primary": {
            "type": "boolean",
            "description": "Boolean flag indicating if it's primary",
        }
    }
}

def process_content_with_gemini(text_content):
    """
    Sends the text to the Gemini API with the extraction prompt and
    parses the JSON response.
    """
    model = genai.GenerativeModel(MODEL_NAME) # type: ignore
    prompt = EXTRACTION_PROMPT.format(text_content=text_content)

    try:
        response = model.generate_content(
            prompt,
            generation_config={
                "response_schema": SCHEMA,
                "response_mime_type": 'application/json',
            }
        )
        return json.loads(response.text)
    except Exception as e:
        print(f" ❌ An error occurred while calling Gemini or parsing its response: {e}")
        return {"error": str(e)}


async def main():
    """Main function to run the data extraction process."""
    if not GOOGLE_API_KEY:
        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
        return

    genai.configure(api_key=GOOGLE_API_KEY) # type: ignore

    print("Retrieving all feed contents...")
    scraped_pages = await get_all_feed_contents()
    if not scraped_pages:
        print("❌ Error: No scraper results found.")
        return
    print("✅ Successfully retrieved all feed contents.")

    all_extracted_deals = []
    total_pages = len(scraped_pages)

    print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")

    for i, page in enumerate(scraped_pages):
        print(f"\nProcessing page {i+1}/{total_pages}: {page['url']}")

        # Avoid processing pages with very little text
        if len(page.get('content', '')) < 150:
            print(" ⏩ Skipping page due to insufficient content.")
            continue

        extracted_info = process_content_with_gemini(page['content'])

        # Check if the extraction was successful and contains actual data
        if extracted_info and "error" not in extracted_info:
            if ("transaction_type" in extracted_info) and ("company_division" in extracted_info) and ("recipient" in extracted_info):
                print(" ✔️ Found relevant info")
                all_extracted_deals.append(extracted_info)
            else:
                print(" ❌ insufficient info")
                print(f" Extracted info: {extracted_info}")

        # Add a small delay to respect API rate limits (1 second is safe)
        time.sleep(1)

    if all_extracted_deals:
        for transaction in all_extracted_deals:
            requests.post("https://ploughshares.nixc.us/api/transaction", json=transaction)
    else:
        print("\nNo relevant deals were extracted from any of the pages.")

if __name__ == "__main__":
    asyncio.run(main())
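The upload loop posts each transaction without checking the response; a possible hardening, using the same endpoint as the script and plain requests error handling, might look like this sketch:

import requests

def post_transaction(transaction: dict) -> bool:
    # Post one extracted transaction and report whether the API accepted it
    try:
        resp = requests.post(
            "https://ploughshares.nixc.us/api/transaction",
            json=transaction,
            timeout=30,
        )
        resp.raise_for_status()
        return True
    except requests.RequestException as e:
        print(f" ❌ Failed to post transaction: {e}")
        return False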
@@ -0,0 +1,48 @@
annotated-types==0.7.0
anyio==4.9.0
beautifulsoup4==4.13.4
cachetools==5.5.2
certifi==2025.7.14
charset-normalizer==3.4.2
colorama==0.4.6
dnspython==2.7.0
email_validator==2.2.0
feedparser==6.0.11
google-ai-generativelanguage==0.6.15
google-api-core==2.25.1
google-api-python-client==2.177.0
google-auth==2.40.3
google-auth-httplib2==0.2.0
google-genai==1.28.0
google-generativeai==0.8.5
googleapis-common-protos==1.70.0
greenlet==3.2.3
grpcio==1.74.0
grpcio-status==1.71.2
h11==0.16.0
httpcore==1.0.9
httplib2==0.22.0
httpx==0.28.1
idna==3.10
playwright==1.54.0
proto-plus==1.26.1
protobuf==5.29.5
pyasn1==0.6.1
pyasn1_modules==0.4.2
pydantic==2.11.7
pydantic_core==2.33.2
pyee==13.0.0
pyparsing==3.2.3
python-dotenv==1.1.1
requests==2.32.4
rsa==4.9.1
sgmllib3k==1.0.0
sniffio==1.3.1
soupsieve==2.7
tenacity==8.5.0
tqdm==4.67.1
typing-inspection==0.4.1
typing_extensions==4.14.1
uritemplate==4.2.0
urllib3==2.5.0
websockets==15.0.1