diff --git a/docker/crawler/.gitignore b/docker/crawler/.gitignore
new file mode 100644
index 0000000..0407f89
--- /dev/null
+++ b/docker/crawler/.gitignore
@@ -0,0 +1,2 @@
+.env
+marketline_cookies.json
\ No newline at end of file
diff --git a/docker/crawler/analyze.py b/docker/crawler/analyze.py
new file mode 100644
index 0000000..e488e14
--- /dev/null
+++ b/docker/crawler/analyze.py
@@ -0,0 +1,122 @@
+import google.generativeai as genai
+import json
+import os
+import time
+from dotenv import load_dotenv
+load_dotenv()
+
+GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
+
+# input JSON generated by the scraper (marketline_crawler.py)
+INPUT_FILE = os.path.join("crawl_results", "successful_pages.json")
+
+# output JSON for any deals extracted from the scraped data
+OUTPUT_FILE = os.path.join("crawl_results", "extracted_arms_deals.json")
+
+MODEL_NAME = "gemini-2.0-flash-lite"
+
+# TODO: refine
+EXTRACTION_PROMPT = """
+From the document text provided below, extract key details about any military or arms exports.
+
+Your task is to identify the following:
+- "company_name": The name of the company involved in manufacturing or selling.
+- "weapon_system": The specific type of weapon, vehicle, or military equipment.
+- "destination_country": The country receiving the goods.
+- "sale_value": The monetary value of the deal, including currency (e.g., "$15 Billion CAD").
+- "summary": A concise, one-sentence summary of the export deal or report.
+
+If a specific piece of information cannot be found in the text, you MUST use the value "Not Found".
+
+Provide your response as a single, clean JSON object. Do not add any explanatory text before or after the JSON.
+
+---
+DOCUMENT TEXT:
+{text_content}
+"""
+
+def load_scraped_data(filepath):
+    """Loads the scraped data from the JSON file."""
+    try:
+        with open(filepath, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except FileNotFoundError:
+        print(f"❌ Error: Input file not found at '{filepath}'.")
+        print("Ensure you have run the scraper first.")
+        return None
+
+def save_extracted_data(filepath, data):
+    """Saves the final extracted data to a new JSON file."""
+    with open(filepath, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=4, ensure_ascii=False)
+    print(f"\n✅ Success! Saved extracted info to '{filepath}'.")
+
+
+def process_content_with_gemini(text_content):
+    """
+    Sends the text to the Gemini API with the extraction prompt and
+    parses the JSON response.
+    """
+    model = genai.GenerativeModel(MODEL_NAME)
+    prompt = EXTRACTION_PROMPT.format(text_content=text_content)
+
+    try:
+        response = model.generate_content(prompt)
+        # Clean the response to ensure it's valid JSON. Gemini sometimes
+        # wraps its JSON response in markdown backticks.
+        clean_json = response.text.strip().replace("```json", "").replace("```", "")
+        # print("GOT: ", clean_json)
+        return json.loads(clean_json)
+    except Exception as e:
+        print(f"   ❌ An error occurred while calling Gemini or parsing its response: {e}")
+        return {"error": str(e)}
+
+
+def main():
+    """Main function to run the data extraction process."""
+    if not GOOGLE_API_KEY:
+        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
+        return
+
+    genai.configure(api_key=GOOGLE_API_KEY)
+
+    scraped_pages = load_scraped_data(INPUT_FILE)
+    if not scraped_pages:
+        print("❌ Error: No scraper results found. Run marketline_crawler.py to generate crawl_results/successful_pages.json")
+        return
+
+    all_extracted_deals = []
+    total_pages = len(scraped_pages)
+
+    print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")
+
+    for i, page in enumerate(scraped_pages):
+        print(f"\nProcessing page {i+1}/{total_pages}: {page['url']}")
+
+        # Avoid processing pages with very little text
+        if len(page.get('content', '')) < 150:
+            print("   ⏩ Skipping page due to insufficient content.")
+            continue
+
+        extracted_info = process_content_with_gemini(page['content'])
+
+        # Check if the extraction was successful and contains actual data
+        if extracted_info and "error" not in extracted_info:
+            if extracted_info.get("company_name") != "Not Found" or extracted_info.get("weapon_system") != "Not Found":
+                print(f"   ✔️ Found relevant info: {extracted_info.get('company_name', 'N/A')} | {extracted_info.get('weapon_system', 'N/A')}")
+                # Add the source URL for reference
+                extracted_info['source_url'] = page['url']
+                all_extracted_deals.append(extracted_info)
+            else:
+                print("   ⚪ No relevant deals found on this page.")
+
+        # Add a small delay to respect API rate limits (1 second is safe)
+        time.sleep(1)
+
+    if all_extracted_deals:
+        save_extracted_data(OUTPUT_FILE, all_extracted_deals)
+    else:
+        print("\nNo relevant deals were extracted from any of the pages.")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
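# Illustrative sketch (not applied in the diff above). The backtick-stripping in
# process_content_with_gemini() assumes the model reply contains nothing but the
# (possibly fenced) JSON object; a slightly more tolerant variant could pull out
# the first {...} span before parsing. parse_model_json is a hypothetical helper name.
import json
import re

def parse_model_json(raw_text: str) -> dict:
    """Best-effort parse of the first JSON object in a model reply."""
    match = re.search(r"\{.*\}", raw_text, re.DOTALL)  # grab the outermost-looking object
    if not match:
        return {"error": "no JSON object found in response"}
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError as e:
        return {"error": f"invalid JSON: {e}"}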
diff --git a/docker/crawler/check.py b/docker/crawler/check.py
new file mode 100644
index 0000000..c1e3e10
--- /dev/null
+++ b/docker/crawler/check.py
@@ -0,0 +1,26 @@
+# a quick little side script to look into the results
+# NOT used for the main workflow
+
+import asyncio
+from playwright.async_api import async_playwright
+import json
+import os
+from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+
+
+# check how many pages are invalid password pages (it was not many -- like 7/100)
+with open("crawl_results/successful_pages.json", "r") as f:
+    results = json.load(f)
+
+counter = 0
+total = 0
+for result in results:
+    total += 1
+    if "password is invalid" not in result['content']:
+        print("\n\n\n FOUND: \n", result['content'])
+        counter += 1
+
+print(f"\n\n\n FINAL GOOD: {counter} OF {total} RESULTS")
\ No newline at end of file
diff --git a/docker/crawler/crawl_results/.gitignore b/docker/crawler/crawl_results/.gitignore
new file mode 100644
index 0000000..94a2dd1
--- /dev/null
+++ b/docker/crawler/crawl_results/.gitignore
@@ -0,0 +1 @@
+*.json
\ No newline at end of file
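# Illustrative sketch (not part of the diff): the record shape that check.py and
# analyze.py assume for each entry of crawl_results/successful_pages.json, based on
# what crawl_with_saved_cookies() appends to successful_data further below. Only the
# keys come from the code; the values here are made up for illustration.
example_page = {
    "url": "https://advantage.marketline.com/Analysis/...",  # hypothetical entry
    "content": "markdown text scraped from the page ...",
    "depth": 1,
    "score": 0.42,
}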
diff --git a/docker/crawler/marketline_crawler.py b/docker/crawler/marketline_crawler.py
new file mode 100644
index 0000000..17744ea
--- /dev/null
+++ b/docker/crawler/marketline_crawler.py
@@ -0,0 +1,183 @@
+import asyncio
+from playwright.async_api import async_playwright, Page
+import json
+import os
+from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
+from crawl4ai.content_scraping_strategy import ContentScrapingStrategy, ScrapingResult, LXMLWebScrapingStrategy
+from crawl4ai.processors.pdf import PDFContentScrapingStrategy
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+from crawl4ai.deep_crawling.filters import URLPatternFilter
+from datetime import datetime
+
+# --- CONFIGURATION ---
+
+# TODO: this will need to change for different organizations (i.e. universities)
+
+# make this the link for the university login used when accessing Marketline
+LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Bs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"
+
+# shouldn't need to change; this is the page we wait for after logging in, which triggers saving cookies
+HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"
+
+# the root page used to seed the crawl
+CRAWLPAGE_URL = "https://advantage.marketline.com/Search?industry=2800001"
+
+# name of the file where the session cookies are saved
+COOKIES_FILE = "marketline_cookies.json"
+
+# --- CRAWLER SETTINGS ---
+DEPTH = 2
+COUNT = 10  # Increased for better testing
+
+# TODO: maybe make this list more comprehensive?
+SCRAPER_KEYWORDS = [
+    # Core Terms
+    "arms export", "arms sale", "arms trade", "weapons export", "weapons deal",
+    "military export", "defence contract", "defense contract",
+
+    # Canadian Context
+    "canadian armed forces", "global affairs canada", "canadian defence",
+    "canadian military", "royal canadian navy", "royal canadian air force",
+
+    # Equipment & Technology
+    "armoured vehicle", "light armoured vehicle", "lav", "naval ship", "warship",
+    "frigate", "fighter jet", "military aircraft", "surveillance", "radar",
+    "artillery", "munitions", "firearms", "aerospace",
+
+    # Action & Policy Terms
+    "procurement", "acquisition", "military aid", "export permit", "itar"
+]
+
+# runs the login process and saves cookies so that the scrape can run with authentication
+async def login_and_save_cookies():
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=False)
+        context = await browser.new_context()
+        page = await context.new_page()
+
+        try:
+            await page.goto(LOGIN_URL)
+            await page.wait_for_url(HOMEPAGE_URL, timeout=300000)
+
+            print("Login detected. Saving session cookies...")
+            cookies = await context.cookies()
+            with open(COOKIES_FILE, "w") as f:
+                json.dump(cookies, f)
+
+            print("Cookies saved successfully!")
+            await crawl_with_saved_cookies()
+
+        except Exception as e:
+            print(f"Login failed: {e}")
+            print("Error details:")
+            print(await page.content())
+
+        finally:
+            await context.close()
+            await browser.close()
+
+def save_results_to_json(successful_data, failed_pages):
+    """
+    Saves the successful and failed crawl results into separate JSON files
+    in a dedicated directory.
+    """
+    output_dir = "crawl_results"
+    os.makedirs(output_dir, exist_ok=True)
+    print(f"\n💾 Saving results to '{output_dir}' directory...")
+
+    # Define file paths
+    successful_file = os.path.join(output_dir, "successful_pages.json")
+    failed_file = os.path.join(output_dir, "failed_pages.json")
+
+    # Save successfully scraped data
+    with open(successful_file, "w", encoding="utf-8") as f:
+        json.dump(successful_data, f, indent=4, ensure_ascii=False)
+    print(f"   Saved data for {len(successful_data)} successful pages to '{successful_file}'")
+
+    # Save failed pages if any
+    if failed_pages:
+        with open(failed_file, "w", encoding="utf-8") as f:
+            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
+        print(f"   Saved info for {len(failed_pages)} failed pages to '{failed_file}'")
+ """ + output_dir = "crawl_results" + os.makedirs(output_dir, exist_ok=True) + print(f"\n💾 Saving results to '{output_dir}' directory...") + + # Define file paths + successful_file = os.path.join(output_dir, "successful_pages.json") + failed_file = os.path.join(output_dir, "failed_pages.json") + + # Save successfully scraped data + with open(successful_file, "w", encoding="utf-8") as f: + json.dump(successful_data, f, indent=4, ensure_ascii=False) + print(f" Saved data for {len(successful_data)} successful pages to '{successful_file}'") + + # Save failed pages if any + if failed_pages: + with open(failed_file, "w", encoding="utf-8") as f: + json.dump(failed_pages, f, indent=4, ensure_ascii=False) + print(f" Saved info for {len(failed_pages)} failed pages to '{failed_file}'") + + +# runs the crawler with the cookies collected during login +async def crawl_with_saved_cookies(): + + if not os.path.exists(COOKIES_FILE): + print("No cookies found. Please run login first.") + return + + with open(COOKIES_FILE, "r") as f: + cookies = json.load(f) + + browser_config = BrowserConfig(cookies=cookies) + + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=DEPTH, + max_pages=COUNT, + url_scorer=KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS,), + ), + scraping_strategy=LXMLWebScrapingStrategy(), + # TODO: scrape the PDFs better + # scraping_strategy=PDFCrawlerStrategy(), + verbose=True, + stream=True, + page_timeout=30000 + ) + + successful_data = [] + failed_pages = [] + + async with AsyncWebCrawler(config=browser_config) as crawler: + async for result in await crawler.arun(CRAWLPAGE_URL, config=config): + if result.success: + depth = result.metadata.get("depth", 0) + score = result.metadata.get("score", 0) + + # here we could look at a few things, the HTML, markdown, raw text, etc. + scraped_content = result.markdown + + print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}") + # NEW: Print a preview of the content to confirm it's being scraped + print(f" 📄 Content length: {len(scraped_content)}. Preview: {scraped_content[:120]}...") + + successful_data.append({ + "url": result.url, + "content": scraped_content, + "depth": depth, + "score": round(score, 2) + }) + else: + failed_pages.append({ + 'url': result.url, + 'error': result.error_message, + 'depth': result.metadata.get("depth", 0) + }) + print(f"❌ Failed: {result.url} - {result.error_message}") + + print(f"📊 Results: {len(successful_data)} successful, {len(failed_pages)} failed") + + save_results_to_json(successful_data, failed_pages) + + # Analyze failures by depth + if failed_pages: + failure_by_depth = {} + for failure in failed_pages: + depth = failure['depth'] + failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1 + + print("❌ Failures by depth:") + for depth, count in sorted(failure_by_depth.items()): + print(f" Depth {depth}: {count} failures") + +if __name__ == "__main__": + # Choose which function to run + # 1. First, run the login function once to get your cookies + asyncio.run(login_and_save_cookies()) + + # 2. 
diff --git a/docker/crawler/requirements.txt b/docker/crawler/requirements.txt
new file mode 100644
index 0000000..9f3862b
--- /dev/null
+++ b/docker/crawler/requirements.txt
@@ -0,0 +1,32 @@
+beautifulsoup4==4.13.4
+cachetools==5.5.2
+certifi==2025.7.14
+charset-normalizer==3.4.2
+dotenv==0.9.9
+google==3.0.0
+google-ai-generativelanguage==0.1.0
+google-api-core==2.25.1
+google-api-python-client==2.177.0
+google-auth==2.40.3
+google-auth-httplib2==0.2.0
+google-generativeai==0.1.0rc1
+googleapis-common-protos==1.70.0
+grpcio==1.70.0
+grpcio-status==1.62.3
+httplib2==0.22.0
+idna==3.10
+proto-plus==1.26.1
+protobuf==4.25.8
+pyasn1==0.6.1
+pyasn1-modules==0.4.2
+pyparsing==3.1.4
+python-dotenv==1.0.1
+requests==2.32.4
+rsa==4.9.1
+soupsieve==2.7
+typing-extensions==4.13.2
+uritemplate==4.1.1
+urllib3==2.2.3
+# also imported by marketline_crawler.py / check.py; left unpinned here, pin as needed
+crawl4ai
+playwright
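# Illustrative sketch (not part of the diff): what a single record written to
# crawl_results/extracted_arms_deals.json is expected to look like, given the keys
# requested in EXTRACTION_PROMPT plus the source_url added in analyze.py's main().
# All values below are invented for illustration only.
example_deal = {
    "company_name": "Example Defence Systems Inc.",
    "weapon_system": "light armoured vehicle",
    "destination_country": "Not Found",
    "sale_value": "$15 Billion CAD",
    "summary": "One-sentence summary of the export deal described on the page.",
    "source_url": "https://advantage.marketline.com/Analysis/...",
}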