# ploughshares/docker/crawler/analyze.py
# (file-listing metadata — 122 lines, 4.6 KiB, Python — kept as a comment
#  so the module remains syntactically valid)
import google.generativeai as genai
import json
import os
import time
from dotenv import load_dotenv
load_dotenv()
# API key for Google Generative AI; read from the environment or a .env file.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
# Input: JSON generated by the scraper (marketline_crawler.py).
INPUT_FILE = os.path.join("crawl_results", "successful_pages.json")
# Output: JSON listing any deals extracted from the scraped data.
OUTPUT_FILE = os.path.join("crawl_results", "extracted_arms_deals.json")
# Gemini model used for the extraction calls.
MODEL_NAME = "gemini-2.0-flash-lite"
# Prompt template; {text_content} is filled with one scraped page's text.
# TODO: refine
EXTRACTION_PROMPT = """
From the document text provided below, extract key details about any military or arms exports.
Your task is to identify the following:
- "company_name": The name of the company involved in manufacturing or selling.
- "weapon_system": The specific type of weapon, vehicle, or military equipment.
- "destination_country": The country receiving the goods.
- "sale_value": The monetary value of the deal, including currency (e.g., "$15 Billion CAD").
- "summary": A concise, one-sentence summary of the export deal or report.
If a specific piece of information cannot be found in the text, you MUST use the value "Not Found".
Provide your response as a single, clean JSON object. Do not add any explanatory text before or after the JSON.
---
DOCUMENT TEXT:
{text_content}
"""
def load_scraped_data(filepath):
    """Load previously scraped pages from *filepath*.

    Returns the parsed JSON payload, or None (after printing a hint to run
    the scraper first) when the file does not exist.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except FileNotFoundError:
        # A missing input file is an expected condition, not a crash.
        print(f"❌ Error: Input file not found at '{filepath}'.")
        print("Ensure you have run the scraper first.")
        return None
def save_extracted_data(filepath, data):
    """Write *data* to *filepath* as pretty-printed UTF-8 JSON.

    Creates the parent directory if needed, so the save no longer raises
    FileNotFoundError when crawl_results/ was cleaned up between the scrape
    and the analysis run.
    """
    parent = os.path.dirname(filepath)
    if parent:
        # The scraper normally creates this directory; recreate it defensively.
        os.makedirs(parent, exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII (e.g. accented names) readable.
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"\n✅ Success! Saved extracted info to '{filepath}'.")
def _strip_code_fences(text):
    """Remove a surrounding markdown code fence (``` or ```json) if present.

    Only leading/trailing fences are stripped. The previous blanket
    .replace("```json", "").replace("```", "") removed backtick sequences
    anywhere in the response, which could corrupt JSON string values that
    legitimately contain backticks.
    """
    cleaned = text.strip()
    if cleaned.startswith("```json"):
        cleaned = cleaned[len("```json"):]
    elif cleaned.startswith("```"):
        cleaned = cleaned[len("```"):]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-len("```")]
    return cleaned.strip()


def process_content_with_gemini(text_content):
    """
    Send *text_content* to the Gemini API with the extraction prompt and
    parse the JSON response.

    Returns the parsed dict on success, or {"error": <message>} when the
    API call or JSON parsing fails.
    """
    model = genai.GenerativeModel(MODEL_NAME)
    prompt = EXTRACTION_PROMPT.format(text_content=text_content)
    try:
        response = model.generate_content(prompt)
        # Gemini sometimes wraps its JSON response in markdown backticks.
        return json.loads(_strip_code_fences(response.text))
    except Exception as e:
        # Broad catch on purpose: one bad page must not abort the whole run;
        # the caller checks for the "error" key.
        print(f" ❌ An error occurred while calling Gemini or parsing its response: {e}")
        return {"error": str(e)}
def main():
    """Drive the end-to-end extraction: load pages, query Gemini, save deals."""
    if not GOOGLE_API_KEY:
        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
        return
    genai.configure(api_key=GOOGLE_API_KEY)

    pages = load_scraped_data(INPUT_FILE)
    if not pages:
        print("❌ Error: No scraper results found. Run marketline_crawler.py to generate crawl_results/successful_pages.json")
        return

    deals = []
    page_count = len(pages)
    print(f"🤖 Starting information extraction with Gemini for {page_count} pages...")

    for position, page in enumerate(pages, start=1):
        print(f"\nProcessing page {position}/{page_count}: {page['url']}")
        # Pages with almost no text are not worth an API call.
        if len(page.get('content', '')) < 150:
            print(" ⏩ Skipping page due to insufficient content.")
            continue

        info = process_content_with_gemini(page['content'])
        # Only keep results that parsed cleanly and contain actual data.
        if info and "error" not in info:
            has_company = info.get("company_name") != "Not Found"
            has_weapon = info.get("weapon_system") != "Not Found"
            if has_company or has_weapon:
                print(f" ✔️ Found relevant info: {info.get('company_name', 'N/A')} | {info.get('weapon_system', 'N/A')}")
                # Record provenance alongside the extracted fields.
                info['source_url'] = page['url']
                deals.append(info)
            else:
                print(" ⚪ No relevant deals found on this page.")

        # Throttle to respect API rate limits (1 second is safe).
        time.sleep(1)

    if deals:
        save_extracted_data(OUTPUT_FILE, deals)
    else:
        print("\nNo relevant deals were extracted from any of the pages.")


if __name__ == "__main__":
    main()