"""Extract arms-deal details from scraped pages using the Gemini API.

Reads the scraper's output JSON (``crawl_results/successful_pages.json``),
sends each page's text to Gemini with an extraction prompt, and writes any
relevant deals to ``crawl_results/extracted_arms_deals.json``.
"""

import json
import os
import time

import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# JSON generated by the scraper (marketline_crawler.py)
INPUT_FILE = os.path.join("crawl_results", "successful_pages.json")
# Output JSON: any extracted deals from the scraped data
OUTPUT_FILE = os.path.join("crawl_results", "extracted_arms_deals.json")
MODEL_NAME = "gemini-2.0-flash-lite"

# TODO: refine
EXTRACTION_PROMPT = """
From the document text provided below, extract key details about any military or arms exports. Your task is to identify the following:
- "company_name": The name of the company involved in manufacturing or selling.
- "weapon_system": The specific type of weapon, vehicle, or military equipment.
- "destination_country": The country receiving the goods.
- "sale_value": The monetary value of the deal, including currency (e.g., "$15 Billion CAD").
- "summary": A concise, one-sentence summary of the export deal or report.

If a specific piece of information cannot be found in the text, you MUST use the value "Not Found".

Provide your response as a single, clean JSON object. Do not add any explanatory text before or after the JSON.

---
DOCUMENT TEXT:
{text_content}
"""


def load_scraped_data(filepath):
    """Load the scraped data from the JSON file.

    Returns the parsed JSON payload, or None when the file does not exist
    (the caller treats None as "run the scraper first").
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: Input file not found at '{filepath}'.")
        print("Ensure you have run the scraper first.")
        return None


def save_extracted_data(filepath, data):
    """Save the final extracted data to a new JSON file."""
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"\n✅ Success! Saved extracted info to '{filepath}'.")


def _strip_markdown_fence(text):
    """Remove a surrounding ```json ... ``` markdown fence, if present.

    Only strips a fence at the very start/end of the response, so backticks
    that legitimately appear *inside* the JSON payload are left intact
    (a blanket str.replace would corrupt them).
    """
    cleaned = text.strip()
    if cleaned.startswith("```json"):
        cleaned = cleaned[len("```json"):]
    elif cleaned.startswith("```"):
        cleaned = cleaned[3:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def process_content_with_gemini(text_content):
    """Send the text to the Gemini API with the extraction prompt and parse
    the JSON response.

    Returns the parsed dict on success, or {"error": <message>} on any API
    or JSON-parsing failure (broad catch is deliberate: one bad page must
    not abort the whole batch).
    """
    model = genai.GenerativeModel(MODEL_NAME)
    prompt = EXTRACTION_PROMPT.format(text_content=text_content)
    try:
        response = model.generate_content(prompt)
        # Gemini sometimes wraps its JSON response in markdown backticks;
        # strip only an outer fence so the payload itself is untouched.
        clean_json = _strip_markdown_fence(response.text)
        return json.loads(clean_json)
    except Exception as e:
        print(f" ❌ An error occurred while calling Gemini or parsing its response: {e}")
        return {"error": str(e)}


def main():
    """Main function to run the data extraction process."""
    if not GOOGLE_API_KEY:
        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
        return

    genai.configure(api_key=GOOGLE_API_KEY)

    scraped_pages = load_scraped_data(INPUT_FILE)
    if not scraped_pages:
        print("❌ Error: No scraper results found. Run marketline_crawler.py to generate crawl_results/successful_pages.json")
        return

    all_extracted_deals = []
    total_pages = len(scraped_pages)
    print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")

    for i, page in enumerate(scraped_pages):
        # .get guards against malformed scraper records missing 'url'
        url = page.get('url', 'N/A')
        print(f"\nProcessing page {i+1}/{total_pages}: {url}")

        # Avoid processing pages with very little text
        if len(page.get('content', '')) < 150:
            print("   ⏩ Skipping page due to insufficient content.")
            continue

        extracted_info = process_content_with_gemini(page['content'])

        # Check if the extraction was successful and contains actual data
        if extracted_info and "error" not in extracted_info:
            if extracted_info.get("company_name") != "Not Found" or extracted_info.get("weapon_system") != "Not Found":
                print(f"   ✔️ Found relevant info: {extracted_info.get('company_name', 'N/A')} | {extracted_info.get('weapon_system', 'N/A')}")
                # Add the source URL for reference
                extracted_info['source_url'] = url
                all_extracted_deals.append(extracted_info)
            else:
                print("   ⚪ No relevant deals found on this page.")

        # Add a small delay to respect API rate limits (1 second is safe)
        time.sleep(1)

    if all_extracted_deals:
        save_extracted_data(OUTPUT_FILE, all_extracted_deals)
    else:
        print("\nNo relevant deals were extracted from any of the pages.")


if __name__ == "__main__":
    main()