added crawler
ci/woodpecker/push/woodpecker Pipeline was successful
This commit is contained in:
parent 9dea0bac65
commit f06d01613f
@ -0,0 +1,2 @@
.env
marketline_cookies.json
@ -0,0 +1,122 @@
import google.generativeai as genai
import json
import os
import time
from dotenv import load_dotenv

load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# JSON generated by the scraper (marketline_crawler.py)
INPUT_FILE = os.path.join("crawl_results", "successful_pages.json")
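# (Each record in successful_pages.json is written by marketline_crawler.py with the shape
# {"url": ..., "content": ..., "depth": ..., "score": ...}; only "url" and "content" are used here.)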

# output JSON file for any extracted deals from the scraped data
OUTPUT_FILE = os.path.join("crawl_results", "extracted_arms_deals.json")

MODEL_NAME = "gemini-2.0-flash-lite"

# TODO: refine
EXTRACTION_PROMPT = """
From the document text provided below, extract key details about any military or arms exports.

Your task is to identify the following:
- "company_name": The name of the company involved in manufacturing or selling.
- "weapon_system": The specific type of weapon, vehicle, or military equipment.
- "destination_country": The country receiving the goods.
- "sale_value": The monetary value of the deal, including currency (e.g., "$15 Billion CAD").
- "summary": A concise, one-sentence summary of the export deal or report.

If a specific piece of information cannot be found in the text, you MUST use the value "Not Found".

Provide your response as a single, clean JSON object. Do not add any explanatory text before or after the JSON.

---
DOCUMENT TEXT:
{text_content}
"""


def load_scraped_data(filepath):
    """Loads the scraped data from the JSON file."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: Input file not found at '{filepath}'.")
        print("Ensure you have run the scraper first.")
        return None


def save_extracted_data(filepath, data):
    """Saves the final extracted data to a new JSON file."""
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"\n✅ Success! Saved extracted info to '{filepath}'.")


def process_content_with_gemini(text_content):
    """
    Sends the text to the Gemini API with the extraction prompt and
    parses the JSON response.
    """
    model = genai.GenerativeModel(MODEL_NAME)
    prompt = EXTRACTION_PROMPT.format(text_content=text_content)

    try:
        response = model.generate_content(prompt)
        # Clean the response to ensure it's valid JSON. Gemini sometimes
        # wraps its JSON response in markdown backticks.
        clean_json = response.text.strip().replace("```json", "").replace("```", "")
        # print("GOT: ", clean_json)
        return json.loads(clean_json)
    except Exception as e:
        print(f" ❌ An error occurred while calling Gemini or parsing its response: {e}")
        return {"error": str(e)}


def main():
    """Main function to run the data extraction process."""
    if not GOOGLE_API_KEY:
        print("❌ Error: GOOGLE_API_KEY environment variable not set.")
        return

    genai.configure(api_key=GOOGLE_API_KEY)

    scraped_pages = load_scraped_data(INPUT_FILE)
    if not scraped_pages:
        print("❌ Error: No scraper results found. Run marketline_crawler.py to generate crawl_results/successful_pages.json")
        return

    all_extracted_deals = []
    total_pages = len(scraped_pages)

    print(f"🤖 Starting information extraction with Gemini for {total_pages} pages...")

    for i, page in enumerate(scraped_pages):
        print(f"\nProcessing page {i+1}/{total_pages}: {page['url']}")

        # Avoid processing pages with very little text
        if len(page.get('content', '')) < 150:
            print(" ⏩ Skipping page due to insufficient content.")
            continue

        extracted_info = process_content_with_gemini(page['content'])

        # Check if the extraction was successful and contains actual data
        if extracted_info and "error" not in extracted_info:
            if extracted_info.get("company_name") != "Not Found" or extracted_info.get("weapon_system") != "Not Found":
                print(f" ✔️ Found relevant info: {extracted_info.get('company_name', 'N/A')} | {extracted_info.get('weapon_system', 'N/A')}")
                # Add the source URL for reference
                extracted_info['source_url'] = page['url']
                all_extracted_deals.append(extracted_info)
            else:
                print(" ⚪ No relevant deals found on this page.")

        # Add a small delay to respect API rate limits (1 second is safe)
        time.sleep(1)

    if all_extracted_deals:
        save_extracted_data(OUTPUT_FILE, all_extracted_deals)
    else:
        print("\nNo relevant deals were extracted from any of the pages.")


if __name__ == "__main__":
    main()
@ -0,0 +1,26 @@
# a quick little side script to look into the results
# NOT used for the main workflow

import asyncio
from playwright.async_api import async_playwright
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer


# check how many pages are invalid password pages (it was not many -- like 7/100)
with open("crawl_results/successful_pages.json", "r") as f:
    results = json.load(f)

counter = 0
total = 0
for result in results:
    total += 1
    if "password is invalid" not in result['content']:
        print("\n\n\n FOUND: \n", result['content'])
        counter += 1

print(f"\n\n\n FINAL GOOD: {counter} OF {total} RESULTS")
@ -0,0 +1 @@
*.json
@ -0,0 +1,183 @@
import asyncio
from playwright.async_api import async_playwright, Page
import json
import os
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import ContentScrapingStrategy, ScrapingResult, LXMLWebScrapingStrategy
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.deep_crawling.filters import URLPatternFilter
from datetime import datetime

# --- CONFIGURATION ---

# TODO: this will need to change for different organizations (i.e. universities)

# make this the link for your university's login when accessing MarketLine
LOGIN_URL = "https://login.microsoftonline.com/be62a12b-2cad-49a1-a5fa-85f4f3156a7d/saml2?SAMLRequest=fZLBbtswEER%2FReBdokhJjk1YBtz4UANpasRODrkUK2plE6BIlUsl7d9Xtls0ufhIcPhmZ5ZLgt4Oaj3Gk3vCnyNSTH711pG6XNRsDE55IEPKQY%2Bkolb79bcHJbNcDcFHr71lyZoIQzTe3XtHY49hj%2BHNaHx%2BeqjZKcaBFOcwmWTWNNnojyPa4ZRp4NP5%2BE4D359M03iLk4TI87OH5Lvv%2BwNLNtNQxsEZ%2Fx9m%2FdG4rDc6ePJd9M4ah5n2PW9wJkHIJpUa2rRcgEih6iCdV13ZFaKawV3Lz%2BkkS7abmv1Y6Hne5lgVQhSFkPlMgABcdHPddvOmnE0yohG3jiK4WDOZyyrN71K5OEipylIV8pUlu79lfDGuNe54u7nmKiL19XDYpdeYLxjoEnESsNXyPKG6GIcPG7mNhX9rYKubpdOw5B%2F4V7NBPU7A7WbnrdG%2Fk7W1%2Fv0%2BIESsmWB8dX3y%2Ba%2Bs%2FgA%3D&RelayState=https%3A%2F%2Fauth.lib.uoguelph.ca%2Fopenathens%2Fsaml%2F%3Fuuid%3Db3nuk1o5lh78w6j657yd773oxfeqzc0v%26csrfmiddlewaretoken%3D4EzWMhPgP6L5YXtK3FGIgKKQ5KguVDwOuod2abzLQRV6kagUu0BBVWsJVI8N78tT%26opshib%3DLogin%2Bwith%2Byour%2BGryphmail%2BPassword%26staff_mode%3DTrue&sso_reload=true"

# shouldn't need to change. this is what we will wait for to load after logging in to trigger saving cookies.
HOMEPAGE_URL = "https://advantage.marketline.com/HomePage/Home"

# the root page to seed crawling
CRAWLPAGE_URL = "https://advantage.marketline.com/Search?industry=2800001"

# name of the file where cookies are saved
COOKIES_FILE = "marketline_cookies.json"

# --- CRAWLER SETTINGS ---
DEPTH = 2
COUNT = 10  # Increased for better testing

# TODO: maybe make this list more comprehensive?
SCRAPER_KEYWORDS = [
    # Core Terms
    "arms export", "arms sale", "arms trade", "weapons export", "weapons deal",
    "military export", "defence contract", "defense contract",

    # Canadian Context
    "canadian armed forces", "global affairs canada", "canadian defence",
    "canadian military", "royal canadian navy", "royal canadian air force",

    # Equipment & Technology
    "armoured vehicle", "light armoured vehicle", "lav", "naval ship", "warship",
    "frigate", "fighter jet", "military aircraft", "surveillance", "radar",
    "artillery", "munitions", "firearms", "aerospace",

    # Action & Policy Terms
    "procurement", "acquisition", "military aid", "export permit", "itar"
]
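
# These keywords feed the KeywordRelevanceScorer used below; each link discovered during
# the deep crawl is given a keyword-relevance score, which is what later shows up as
# result.metadata["score"] in the crawl loop.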

# runs the login process and saves cookies so that we can run the scraping with authentication
async def login_and_save_cookies():

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()

        try:
            await page.goto(LOGIN_URL)
            await page.wait_for_url(HOMEPAGE_URL, timeout=300000)

            print("Login detected. Saving session cookies...")
            cookies = await context.cookies()
            with open(COOKIES_FILE, "w") as f:
                json.dump(cookies, f)

            print("Cookies saved successfully!")
            await crawl_with_saved_cookies()

        except Exception as e:
            print(f"Login failed: {e}")
            print("Error details:")
            print(await page.content())

        finally:
            await context.close()
            await browser.close()


def save_results_to_json(successful_data, failed_pages):
    """
    Saves the successful and failed crawl results into separate JSON files
    in a dedicated directory.
    """
    output_dir = "crawl_results"
    os.makedirs(output_dir, exist_ok=True)
    print(f"\n💾 Saving results to '{output_dir}' directory...")

    # Define file paths
    successful_file = os.path.join(output_dir, "successful_pages.json")
    failed_file = os.path.join(output_dir, "failed_pages.json")

    # Save successfully scraped data
    with open(successful_file, "w", encoding="utf-8") as f:
        json.dump(successful_data, f, indent=4, ensure_ascii=False)
    print(f" Saved data for {len(successful_data)} successful pages to '{successful_file}'")

    # Save failed pages if any
    if failed_pages:
        with open(failed_file, "w", encoding="utf-8") as f:
            json.dump(failed_pages, f, indent=4, ensure_ascii=False)
        print(f" Saved info for {len(failed_pages)} failed pages to '{failed_file}'")


# runs the crawler with the cookies collected during login
async def crawl_with_saved_cookies():

    if not os.path.exists(COOKIES_FILE):
        print("No cookies found. Please run login first.")
        return

    with open(COOKIES_FILE, "r") as f:
        cookies = json.load(f)

    browser_config = BrowserConfig(cookies=cookies)

    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=DEPTH,
            max_pages=COUNT,
            url_scorer=KeywordRelevanceScorer(keywords=SCRAPER_KEYWORDS),
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        # TODO: scrape the PDFs better
        # scraping_strategy=PDFCrawlerStrategy(),
        verbose=True,
        stream=True,
        page_timeout=30000
    )
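
    # With stream=True the deep crawl is consumed as results arrive, which is why the loop
    # below iterates over crawler.arun(...) with "async for" instead of awaiting a full list.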

    successful_data = []
    failed_pages = []

    async with AsyncWebCrawler(config=browser_config) as crawler:
        async for result in await crawler.arun(CRAWLPAGE_URL, config=config):
            if result.success:
                depth = result.metadata.get("depth", 0)
                score = result.metadata.get("score", 0)

                # here we could look at a few things: the HTML, markdown, raw text, etc.
                scraped_content = result.markdown

                print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}")
                # NEW: Print a preview of the content to confirm it's being scraped
                print(f" 📄 Content length: {len(scraped_content)}. Preview: {scraped_content[:120]}...")

                successful_data.append({
                    "url": result.url,
                    "content": scraped_content,
                    "depth": depth,
                    "score": round(score, 2)
                })
            else:
                failed_pages.append({
                    'url': result.url,
                    'error': result.error_message,
                    'depth': result.metadata.get("depth", 0)
                })
                print(f"❌ Failed: {result.url} - {result.error_message}")

    print(f"📊 Results: {len(successful_data)} successful, {len(failed_pages)} failed")

    save_results_to_json(successful_data, failed_pages)

    # Analyze failures by depth
    if failed_pages:
        failure_by_depth = {}
        for failure in failed_pages:
            depth = failure['depth']
            failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1

        print("❌ Failures by depth:")
        for depth, count in sorted(failure_by_depth.items()):
            print(f" Depth {depth}: {count} failures")


if __name__ == "__main__":
    # Choose which function to run
    # 1. First, run the login function once to get your cookies
    asyncio.run(login_and_save_cookies())
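    # Note: login_and_save_cookies() already calls crawl_with_saved_cookies() once the
    # cookies are saved, so the commented-out call below is only needed on later runs,
    # when marketline_cookies.json already exists.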

    # 2. Then, comment out the login line and run the crawl
    # asyncio.run(crawl_with_saved_cookies())
@ -0,0 +1,29 @@
beautifulsoup4==4.13.4
cachetools==5.5.2
certifi==2025.7.14
charset-normalizer==3.4.2
dotenv==0.9.9
google==3.0.0
google-ai-generativelanguage==0.1.0
google-api-core==2.25.1
google-api-python-client==2.177.0
google-auth==2.40.3
google-auth-httplib2==0.2.0
google-generativeai==0.1.0rc1
googleapis-common-protos==1.70.0
grpcio==1.70.0
grpcio-status==1.62.3
httplib2==0.22.0
idna==3.10
proto-plus==1.26.1
protobuf==4.25.8
pyasn1==0.6.1
pyasn1-modules==0.4.2
pyparsing==3.1.4
python-dotenv==1.0.1
requests==2.32.4
rsa==4.9.1
soupsieve==2.7
typing-extensions==4.13.2
uritemplate==4.1.1
urllib3==2.2.3