import google.generativeai as genai
import json
import os
import time

from dotenv import load_dotenv

load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# JSON produced by the scraper (marketline_crawler.py)
INPUT_FILE = os.path.join("crawl_results", "successful_pages.json")

# Output JSON file for the deals extracted from the scraped data
OUTPUT_FILE = os.path.join("crawl_results", "extracted_arms_deals.json")

MODEL_NAME = "gemini-2.0-flash-lite"

# TODO: refine
EXTRACTION_PROMPT = """
From the document text provided below, extract key details about any military or arms exports.

Your task is to identify the following:
- "company_name": The name of the company involved in manufacturing or selling.
- "weapon_system": The specific type of weapon, vehicle, or military equipment.
- "destination_country": The country receiving the goods.
- "sale_value": The monetary value of the deal, including currency (e.g., "$15 Billion CAD").
- "summary": A concise, one-sentence summary of the export deal or report.

If a specific piece of information cannot be found in the text, you MUST use the value "Not Found".

Provide your response as a single, clean JSON object. Do not add any explanatory text before or after the JSON.

---
DOCUMENT TEXT:
{text_content}
"""


def load_scraped_data(filepath):
    """Loads the scraped data from the JSON file."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: Input file not found at '{filepath}'.")
        print("Ensure you have run the scraper first.")
        return None
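
# Expected shape of successful_pages.json, inferred from how main() reads it
# (the URL and content below are illustrative, not real crawl output):
# [
#     {"url": "https://example.com/some-report", "content": "Full page text..."}
# ]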


def save_extracted_data(filepath, data):
    """Saves the final extracted data to a new JSON file."""
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"\n✅ Success! Saved extracted info to '{filepath}'.")


def process_content_with_gemini(text_content):
    """
    Sends the text to the Gemini API with the extraction prompt and
    parses the JSON response.
    """
    model = genai.GenerativeModel(MODEL_NAME)
    prompt = EXTRACTION_PROMPT.format(text_content=text_content)

    try:
        response = model.generate_content(prompt)
        # Clean the response to ensure it's valid JSON. Gemini sometimes
        # wraps its JSON response in markdown backticks.
        clean_json = response.text.strip().replace("```json", "").replace("```", "")
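        # Alternative (assumes a recent google-generativeai SDK, not part of
        # the original script): the model can be asked for raw JSON directly,
        # which would make the fence-stripping above unnecessary, e.g.:
        #   model = genai.GenerativeModel(
        #       MODEL_NAME,
        #       generation_config={"response_mime_type": "application/json"},
        #   )
        # Whether this applies depends on the installed SDK version and model.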
        return json.loads(clean_json)
    except Exception as e:
        print(f" ❌ An error occurred while calling Gemini or parsing its response: {e}")
        return {"error": str(e)}


def main():
    """Main function to run the data extraction process."""
    if not GOOGLE_API_KEY:
        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
        return

    genai.configure(api_key=GOOGLE_API_KEY)

    scraped_pages = load_scraped_data(INPUT_FILE)
    if not scraped_pages:
        print("❌ Error: No scraper results found. Run marketline_crawler.py to generate crawl_results/successful_pages.json")
        return

    all_extracted_deals = []
    total_pages = len(scraped_pages)

    print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")

    for i, page in enumerate(scraped_pages):
        print(f"\nProcessing page {i+1}/{total_pages}: {page['url']}")

        # Skip pages with too little text to describe a meaningful deal
        if len(page.get('content', '')) < 150:
            print(" ⏩ Skipping page due to insufficient content.")
            continue

        extracted_info = process_content_with_gemini(page['content'])

        # Check if the extraction was successful and contains actual data
        if extracted_info and "error" not in extracted_info:
            if extracted_info.get("company_name") != "Not Found" or extracted_info.get("weapon_system") != "Not Found":
                print(f" ✔️ Found relevant info: {extracted_info.get('company_name', 'N/A')} | {extracted_info.get('weapon_system', 'N/A')}")
                # Add the source URL for reference
                extracted_info['source_url'] = page['url']
                all_extracted_deals.append(extracted_info)
            else:
                print(" ⚪ No relevant deals found on this page.")

        # Small delay between requests to stay within API rate limits
        time.sleep(1)
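        # Sketch of an alternative (an assumption, not from the original):
        # on rate-limit errors, exponential backoff with retries tends to be
        # more robust than a single fixed delay, e.g.:
        #   for attempt in range(3):
        #       extracted_info = process_content_with_gemini(page['content'])
        #       if "error" not in extracted_info:
        #           break
        #       time.sleep(2 ** attempt)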

    if all_extracted_deals:
        save_extracted_data(OUTPUT_FILE, all_extracted_deals)
    else:
        print("\nNo relevant deals were extracted from any of the pages.")


if __name__ == "__main__":
    main()