import google.generativeai as genai
import json
import os
import time

from dotenv import load_dotenv

load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# JSON produced by the scraper (marketline_crawler.py)
INPUT_FILE = os.path.join("crawl_results", "successful_pages.json")

# Output JSON file for the deals extracted from the scraped data
OUTPUT_FILE = os.path.join("crawl_results", "extracted_arms_deals.json")

MODEL_NAME = "gemini-2.0-flash-lite"

# TODO: refine
EXTRACTION_PROMPT = """
From the document text provided below, extract key details about any military or arms exports.

Your task is to identify the following:
- "company_name": The name of the company involved in manufacturing or selling.
- "weapon_system": The specific type of weapon, vehicle, or military equipment.
- "destination_country": The country receiving the goods.
- "sale_value": The monetary value of the deal, including currency (e.g., "$15 Billion CAD").
- "summary": A concise, one-sentence summary of the export deal or report.

If a specific piece of information cannot be found in the text, you MUST use the value "Not Found".

Provide your response as a single, clean JSON object. Do not add any explanatory text before or after the JSON.

---
DOCUMENT TEXT:
{text_content}
"""


def load_scraped_data(filepath):
    """Loads the scraped data from the JSON file."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: Input file not found at '{filepath}'.")
        print("Ensure you have run the scraper first.")
        return None
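
# Expected shape of successful_pages.json, inferred from how main() reads it
# (the URL and content below are illustrative, not real crawl output):
# [
#     {"url": "https://example.com/some-report", "content": "Full page text..."}
# ]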


def save_extracted_data(filepath, data):
    """Saves the final extracted data to a new JSON file."""
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"\n✅ Success! Saved extracted info to '{filepath}'.")


def process_content_with_gemini(text_content):
    """
    Sends the text to the Gemini API with the extraction prompt and
    parses the JSON response.
    """
    model = genai.GenerativeModel(MODEL_NAME)
    prompt = EXTRACTION_PROMPT.format(text_content=text_content)

    try:
        response = model.generate_content(prompt)
        # Clean the response to ensure it's valid JSON. Gemini sometimes
        # wraps its JSON response in markdown backticks.
        clean_json = response.text.strip().replace("```json", "").replace("```", "")
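        # Alternative (assumes a recent google-generativeai SDK, not part of
        # the original script): the model can be asked for raw JSON directly,
        # which would make the fence-stripping above unnecessary, e.g.:
        #   model = genai.GenerativeModel(
        #       MODEL_NAME,
        #       generation_config={"response_mime_type": "application/json"},
        #   )
        # Whether this applies depends on the installed SDK version and model.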
        return json.loads(clean_json)
    except Exception as e:
        print(f" ❌ An error occurred while calling Gemini or parsing its response: {e}")
        return {"error": str(e)}


def main():
    """Main function to run the data extraction process."""
    if not GOOGLE_API_KEY:
        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
        return

    genai.configure(api_key=GOOGLE_API_KEY)

    scraped_pages = load_scraped_data(INPUT_FILE)
    if not scraped_pages:
        print("❌ Error: No scraper results found. Run marketline_crawler.py to generate crawl_results/successful_pages.json")
        return

    all_extracted_deals = []
    total_pages = len(scraped_pages)

    print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")

    for i, page in enumerate(scraped_pages):
        print(f"\nProcessing page {i+1}/{total_pages}: {page['url']}")

        # Skip pages with too little text to describe a meaningful deal
        if len(page.get('content', '')) < 150:
            print(" ⏩ Skipping page due to insufficient content.")
            continue

        extracted_info = process_content_with_gemini(page['content'])

        # Check if the extraction was successful and contains actual data
        if extracted_info and "error" not in extracted_info:
            if extracted_info.get("company_name") != "Not Found" or extracted_info.get("weapon_system") != "Not Found":
                print(f" ✔️ Found relevant info: {extracted_info.get('company_name', 'N/A')} | {extracted_info.get('weapon_system', 'N/A')}")
                # Add the source URL for reference
                extracted_info['source_url'] = page['url']
                all_extracted_deals.append(extracted_info)
            else:
                print(" ⚪ No relevant deals found on this page.")

        # Small delay between requests to stay within API rate limits
        time.sleep(1)
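        # Sketch of an alternative (an assumption, not from the original):
        # on rate-limit errors, exponential backoff with retries tends to be
        # more robust than a single fixed delay, e.g.:
        #   for attempt in range(3):
        #       extracted_info = process_content_with_gemini(page['content'])
        #       if "error" not in extracted_info:
        #           break
        #       time.sleep(2 ** attempt)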

    if all_extracted_deals:
        save_extracted_data(OUTPUT_FILE, all_extracted_deals)
    else:
        print("\nNo relevant deals were extracted from any of the pages.")


if __name__ == "__main__":
    main()