better gemini extraction + results to API
ci/woodpecker/push/woodpecker: Pipeline was successful

This commit is contained in:
coleWesterveld 2025-08-09 14:02:17 -04:00
parent c11ec3f09c
commit a21a716525
4 changed files with 268 additions and 45 deletions

analyze.py

@@ -1,42 +1,99 @@
"""
analyze.py
Reads scraped pages (crawl_results/successful_pages.json),
sends each page to Gemini for structured extraction, and writes
API-ready transactions to crawl_results/extracted_arms_deals.json.
- The Gemini prompt requests output that *matches the API's expected fields*.
- Each output object includes `canadian_relevance` and `relation_explanation`
so we can filter out non-Canadian items while still capturing indirect cases.
"""
import google.generativeai as genai
import json
import os
import re
import time
from dotenv import load_dotenv
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
# json generated by the scraper (marketline_crawler.py)
INPUT_FILE = os.path.join("crawl_results", "successful_pages.json")
# output JSON of any extracted deals from the scraped data (API-ready schema)
OUTPUT_FILE = os.path.join("crawl_results", "extracted_arms_deals.json")
MODEL_NAME = "gemini-2.0-flash-lite"
# Prompt: instruct model to return API schema fields and to explicitly indicate
# if and how the result is related to Canada (direct, indirect, none).
EXTRACTION_PROMPT = """
You are a precise data-extraction system.

Given the DOCUMENT TEXT below, extract ALL transactions or arms-export-relevant
entries and output a JSON array (possibly empty) of objects that match the
Project Ploughshares API schema. Output ONLY the JSON array: no markdown,
no commentary, no code fences.

Each object must use the following fields (required fields must be provided
and set to "Not Found" if absent):

Required fields:
- transaction_type (string) # e.g., "Export", "Purchase Order", "Component Supply"
- company_division (string) # company or division name (use "Not Found" if unknown)
- recipient (string) # receiving country or recipient (use "Not Found" if unknown)
Optional fields (include if present):
- amount (string or number) # monetary value if present (e.g., "15,000,000 CAD")
- description (string)
- address_1, address_2, city, province, region, postal_code
- source_date (string YYYY-MM-DD)
- source_description (string)
- grant_type (string)
- commodity_class (string) # e.g., missile components, avionics, engines
- contract_number (string)
- comments (string)
- is_primary (boolean)
Additionally, include these two new fields to help filter relevance:
- canadian_relevance (string) # one of: "direct", "indirect", "none"
- "direct" = Canadian company or Canada-origin export of military goods/components
- "indirect" = Canadian-made parts/components appear in a larger export (final assembly elsewhere)
- "none" = no meaningful Canadian connection
- relation_explanation (string) # short explanation why this is direct/indirect/none (1-2 sentences)
Rules:
1. If a piece of info cannot be found, set it to the string "Not Found" (not null).
2. If multiple transactions are described in the text, output them as separate objects.
3. If the text contains the same transaction repeated, ensure you only output one object per distinct transaction.
4. Output must be valid JSON (an array). Example:
[
  {{
    "transaction_type": "Export",
    "company_division": "Example Corp Canada",
    "recipient": "Country X",
    "amount": "3,000,000 CAD",
    "commodity_class": "avionics modules",
    "description": "Example summary ...",
    "source_url": "https://example.com/article",
    "canadian_relevance": "direct",
    "relation_explanation": "Company is based in Canada and shipped avionics modules."
  }}
]
---
DOCUMENT TEXT:
{text_content}
"""
# -------------------------
# Helper functions
# -------------------------
def load_scraped_data(filepath):
    """Loads the scraped data from the JSON file created by the crawler."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
@@ -45,39 +102,95 @@ def load_scraped_data(filepath):
print("Ensure you have run the scraper first.")
return None
def save_extracted_data(filepath, data):
"""Saves the final extracted data to a new JSON file."""
with open(filepath, "w", encoding="utf-8") as f:
json.dump(data, f, indent=4, ensure_ascii=False)
json.dump(data, f, indent=2, ensure_ascii=False)
print(f"\n✅ Success! Saved extracted info to '{filepath}'.")
def extract_json_from_text(text):
    """
    Attempts to find and return the first JSON array or object in a text blob.
    This removes markdown fences and extracts from the first '[' ... ']' or '{' ... '}' pair.
    """
    if not text or not isinstance(text, str):
        return None
    # Remove common fences
    cleaned = text.strip()
    cleaned = cleaned.replace("```json", "").replace("```", "").strip()
    # Try to locate a JSON array first
    arr_match = re.search(r"(\[.*\])", cleaned, flags=re.DOTALL)
    if arr_match:
        return arr_match.group(1)
    # Otherwise try a single JSON object
    obj_match = re.search(r"(\{.*\})", cleaned, flags=re.DOTALL)
    if obj_match:
        return obj_match.group(1)
    return None
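# A quick sketch of the behavior on a typical fenced Gemini reply:
#   extract_json_from_text('```json\n[{"transaction_type": "Export"}]\n```')
#   returns '[{"transaction_type": "Export"}]'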
def process_content_with_gemini(text_content):
    """
    Sends the text to Gemini with the extraction prompt and parses the JSON response.
    Uses the genai.GenerativeModel interface.
    """
    model = genai.GenerativeModel(MODEL_NAME)
    prompt = EXTRACTION_PROMPT.format(text_content=text_content)
    try:
        response = model.generate_content(prompt)
        # The response object exposes the generated text via .text
        raw = getattr(response, "text", str(response))
        # Try to extract JSON from the possibly noisy response
        json_fragment = extract_json_from_text(raw) or raw
        parsed = json.loads(json_fragment)
        # Normalize a single object into an array
        if isinstance(parsed, dict):
            parsed = [parsed]
        return parsed
    except Exception as e:
        print(f" ❌ An error occurred while calling Gemini or parsing its response: {e}")
        # Print the raw text to help debugging, if available
        try:
            print(" Raw response (truncated):", raw[:1000])
        except Exception:
            pass
        return {"error": str(e)}
def is_valid_transaction(tx):
    """
    Basic validation to ensure required API fields exist.
    Required fields (per the API): transaction_type, company_division, recipient.
    A field that is present but set to "Not Found" is treated as missing for
    the purpose of deciding whether to keep the record.
    """
    for field in ["transaction_type", "company_division", "recipient"]:
        if field not in tx or not tx[field] or tx[field] == "Not Found":
            return False
    return True
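# For example, a hypothetical record like
#   {"transaction_type": "Export", "company_division": "Not Found", "recipient": "Canada"}
# fails validation, because company_division is "Not Found".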
# -------------------------
# Main orchestration
# -------------------------
def main():
    """Main function to run the data extraction process."""
    if not GOOGLE_API_KEY:
        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
        return
    # Configure the SDK with the API key
    genai.configure(api_key=GOOGLE_API_KEY)
    scraped_pages = load_scraped_data(INPUT_FILE)
@@ -91,32 +204,59 @@ def main():
print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")
for i, page in enumerate(scraped_pages):
print(f"\nProcessing page {i+1}/{total_pages}: {page['url']}")
url = page.get("url", "unknown_url")
print(f"\nProcessing page {i+1}/{total_pages}: {url}")
# Avoid processing pages with very little text
if len(page.get('content', '')) < 150:
text = page.get("content", "")
if len(text) < 150:
print(" ⏩ Skipping page due to insufficient content.")
continue
extracted_info = process_content_with_gemini(page['content'])
# Check if the extraction was successful and contains actual data
if extracted_info and "error" not in extracted_info:
if extracted_info.get("company_name") != "Not Found" or extracted_info.get("weapon_system") != "Not Found":
print(f" ✔️ Found relevant info: {extracted_info.get('company_name', 'N/A')} | {extracted_info.get('weapon_system', 'N/A')}")
# Add the source URL for reference
extracted_info['source_url'] = page['url']
all_extracted_deals.append(extracted_info)
else:
print(" ⚪ No relevant deals found on this page.")
# Add a small delay to respect API rate limits (1 second is safe)
extracted_items = process_content_with_gemini(text)
# If model returned a single object or error, handle gracefully
if not extracted_items:
print(" ⚪ Gemini returned no items.")
time.sleep(1)
continue
if isinstance(extracted_items, dict) and "error" in extracted_items:
print(" ⚠️ Gemini error:", extracted_items.get("error"))
time.sleep(1)
continue
# iterate through items (should be array of objects)
for tx in extracted_items:
# attach source_url for traceability
tx.setdefault("source_url", url)
# if the model gives canadian_relevance, use it to decide whether to keep
relevance = (tx.get("canadian_relevance") or "none").lower()
explanation = tx.get("relation_explanation", "")
# If model says 'none', skip by default (these are the irrelevant ones like US missile contracts)
if relevance == "none":
print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", explanation[:200])
continue
# basic required-field check (we want the API-required fields present)
if not is_valid_transaction(tx):
print(" ⚠️ Skipping — missing required API fields in extracted transaction:", tx)
continue
# Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
# Save the item
all_extracted_deals.append(tx)
print(f" ✔️ Kept transaction: {tx.get('company_division')}{tx.get('recipient')} ({relevance})")
# Respect rate limit
time.sleep(1)
# Save results
if all_extracted_deals:
save_extracted_data(OUTPUT_FILE, all_extracted_deals)
else:
print("\nNo relevant deals were extracted from any of the pages.")
print("\nNo relevant Canadian deals were extracted from any of the pages.")
if __name__ == "__main__":
main()
main()

marketline_crawler.py

@@ -22,13 +22,16 @@ HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"
# the root page to seed crawling
CRAWLPAGE_URL = "https://advantage.marketline.com/Search?industry=2800001"
# trying out another page
# CRAWLPAGE_URL = "https://www.defensenews.com/"
# name of file where cookies are saved
COOKIES_FILE = "marketline_cookies.json"
# --- CRAWLER SETTINGS ---
DEPTH = 3
COUNT = 100
# TODO: maybe make this list more comprehensive?
SCRAPER_KEYWORDS = [
@@ -177,7 +180,7 @@ async def crawl_with_saved_cookies():
if __name__ == "__main__":
    # Choose which function to run
    # 1. First, run the login function once to get your cookies
    # asyncio.run(login_and_save_cookies())
    # 2. Then, comment out the login line and run the crawl
    asyncio.run(crawl_with_saved_cookies())
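
One possible refinement (a sketch, not part of this commit): replace the comment-toggling above with a command-line flag, so the same entry point covers both the one-time login and the crawl. This assumes login_and_save_cookies and crawl_with_saved_cookies as defined in this file.

import argparse
import asyncio

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Marketline crawler")
    parser.add_argument("--login", action="store_true",
                        help="log in and save cookies instead of crawling")
    args = parser.parse_args()
    if args.login:
        # One-time step: saves cookies to COOKIES_FILE for later crawls
        asyncio.run(login_and_save_cookies())
    else:
        asyncio.run(crawl_with_saved_cookies())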

docker/crawler/run_all.sh (new file)

@@ -0,0 +1,16 @@
#!/bin/bash
# Runs the full pipeline: scrape, analyze with Gemini, then write results to the API.
set -e # exit immediately if any step fails
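# Note: analyze.py takes no arguments; it reads crawl_results/successful_pages.json
# and writes crawl_results/extracted_arms_deals.json (both paths are hardcoded),
# and it needs GOOGLE_API_KEY in the environment or in a .env file.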
echo "📡 Crawling data..."
python marketline_crawler.py
echo "🧠 Analyzing with Gemini..."
python analyze.py
echo "📤 Sending to API..."
python write_to_api.py crawl_results/extracted_arms_deals.json
echo "✅ All done!"

write_to_api.py

@@ -0,0 +1,64 @@
import json
import requests
import sys
API_BASE_URL = "http://ploughshares.nixc.us/api/transaction"
HEADERS = {"Content-Type": "application/json"}
allowed_fields = {
    "transaction_type", "company_division", "recipient", "amount",
    "description", "address_1", "address_2", "city", "province", "region",
    "postal_code", "source_date", "source_description", "grant_type",
    "commodity_class", "contract_number", "comments", "is_primary"
}
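# These mirror the API schema fields requested in analyze.py's EXTRACTION_PROMPT;
# the helper fields (source_url, canadian_relevance, relation_explanation)
# are deliberately absent, so clean_for_api strips them before POSTing.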
def clean_for_api(tx):
    cleaned = {k: v for k, v in tx.items() if k in allowed_fields}
    # Remove invalid source_date
    if "source_date" in cleaned:
        if not isinstance(cleaned["source_date"], str) or cleaned["source_date"].lower() == "not found":
            cleaned.pop("source_date")
    # Remove invalid amount (API expects numeric)
    if "amount" in cleaned:
        # If "Not Found" or not parseable as a float, drop it
        try:
            float(str(cleaned["amount"]).replace(",", "").replace("$", ""))
        except ValueError:
            cleaned.pop("amount")
    # Use source_url for source_description
    if "source_url" in tx:
        cleaned["source_description"] = tx["source_url"]
    return cleaned
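# Worked example (hypothetical input):
#   clean_for_api({"transaction_type": "Export", "company_division": "X",
#                  "recipient": "Y", "amount": "Not Found",
#                  "source_url": "https://example.com/a", "canadian_relevance": "direct"})
# drops "amount" (not parseable as a number), filters out "canadian_relevance"
# (not an allowed field), and copies the source_url into "source_description".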
def post_transaction(transaction):
    payload = clean_for_api(transaction)
    response = requests.post(API_BASE_URL, headers=HEADERS, json=payload)
    if response.status_code in (200, 201):
        print(f"✅ Created transaction for {payload['company_division']} → ID: {response.json().get('transaction_id')}")
    else:
        print(f"❌ Failed to create transaction: {response.status_code} - {response.text}")
def main(json_file_path):
    with open(json_file_path, "r", encoding="utf-8") as f:
        transactions = json.load(f)
    if not isinstance(transactions, list):
        transactions = [transactions]
    for tx in transactions:
        try:
            post_transaction(tx)
        except Exception as e:
            print(f"Error posting transaction: {e}")
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python write_to_api.py results.json")
        sys.exit(1)
    main(sys.argv[1])
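
post_transaction makes a single unconditional POST per record. If the endpoint proves flaky, a small retry helper would be an easy extension; a minimal sketch, assuming transient failures are connection errors or 5xx responses and that re-POSTing the same payload is acceptable:

import time

def post_with_retry(payload, retries=3, backoff=2.0):
    """Retries transient failures with exponential backoff; returns the response or None."""
    for attempt in range(retries):
        try:
            resp = requests.post(API_BASE_URL, headers=HEADERS, json=payload, timeout=30)
            if resp.status_code < 500:
                return resp
        except requests.RequestException:
            pass
        time.sleep(backoff * (2 ** attempt))
    return None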