diff --git a/docker/crawler-google-alerts/fetch_site.py b/docker/crawler-google-alerts/fetch_site.py
deleted file mode 100644
index a902041..0000000
--- a/docker/crawler-google-alerts/fetch_site.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import asyncio
-from playwright.async_api import async_playwright
-from bs4 import BeautifulSoup
-
-async def fetch_site(url: str) -> str | None:
-    """
-    Fetches the main article text of a URL using Playwright and BeautifulSoup.
-
-    Args:
-        url: The URL of the website to fetch.
-
-    Returns:
-        A string containing the main text content of the page, or None on error.
-    """
-    print(f"fetching {url}")
-    async with async_playwright() as p:
-        browser = await p.chromium.launch()
-        page = await browser.new_page()
-
-        try:
-            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
-
-            content = await page.content()
-            soup = BeautifulSoup(content, 'html.parser')
-
-            # Strategy: Find the main content container
-            # First, try to find a <main> tag. If not, look for an <article> tag.
-            # You can add more fallbacks based on common website structures,
-            # e.g., soup.find('div', id='content')
-            main_content = soup.find('main')
-            if not main_content:
-                main_content = soup.find('article')
-
-            # If a main content area is found, extract text from it.
-            if main_content:
-
-                # (Optional) Remove unwanted elements like scripts or ads from within the main content
-                for element in main_content(['script', 'style', 'aside']): # type: ignore
-                    element.decompose()
-
-                print(f"SUCCESSFUL FETCH: {url}")
-                # .get_text() with separator and strip for cleaner output
-                return main_content.get_text(separator='\n', strip=True)
-            else:
-                # Fallback if no specific container is found (less reliable)
-                print("WARNING: No main content container found. Falling back to body.")
-                if soup.body:
-                    body_text = soup.body.get_text(separator='\n', strip=True)
-                    print(f"SUCCESSFUL FETCH: {url}")
-                    return body_text
-
-        except Exception as e:
-            print(f"FAILED FETCH: {url}")
-            print(f"An error occurred: {e}")
-            return None
-
-        finally:
-            await browser.close()
-
-# Example usage:
-# asyncio.run(fetch_site("https://www.example.com"))
\ No newline at end of file
diff --git a/docker/crawler-google-alerts/get_all_feed_contents.py b/docker/crawler-google-alerts/get_all_feed_contents.py
index 2d32fed..a3763b8 100644
--- a/docker/crawler-google-alerts/get_all_feed_contents.py
+++ b/docker/crawler-google-alerts/get_all_feed_contents.py
@@ -1,11 +1,160 @@
 import asyncio
+import csv
+from dataclasses import dataclass
+import os
 from typing import Dict, List, Tuple
-from clean_string import clean_string
-from fetch_site import fetch_site
-from get_feeds import get_feeds
-from get_links_from_feed import Alert, get_links_from_feed
-import xml.etree.ElementTree as ET
+import feedparser
+import urllib.parse
+import requests
+from clean_string import clean_string
+import xml.etree.ElementTree as ET
+from playwright.async_api import async_playwright
+from bs4 import BeautifulSoup
+from seed_with_csv import seed_with_csv
+
+@dataclass
+class Alert:
+    """A simple data class to hold information about a single alert."""
+    title: str
+    url: str
+    summary: str
+
+def get_links_from_feed(rss_url: str) -> list[Alert]:
+    """
+    Parses a Google Alerts RSS feed URL and extracts the data for each alert.
+
+    Args:
+        rss_url: The URL of the Google Alerts RSS feed.
+
+    Returns:
+        A list of Alert objects. Returns an empty list if the feed
+        cannot be parsed or is empty.
+    """
+    alerts: list[Alert] = []
+    # Parse the RSS feed from the provided URL
+    feed = feedparser.parse(rss_url)
+
+    # Check if the feed was parsed successfully and has entries
+    if feed.bozo:
+        print(f"Error parsing feed: {feed.bozo_exception}")
+        return alerts
+
+    # Iterate over each entry in the feed
+    for entry in feed.entries:
+        # The title is directly available
+        title_soup = BeautifulSoup(entry.title, "html.parser") #type: ignore
+        title = title_soup.get_text()
+
+        # The summary often contains HTML, so we parse it to get clean text.
+        summary_soup = BeautifulSoup(entry.summary, 'html.parser') #type: ignore
+        summary = summary_soup.get_text()
+
+        # The link is a Google redirect URL; we extract the 'url' parameter.
+        link = entry.link
+
+        try:
+            # Parse the URL to easily access its components
+            parsed_url = urllib.parse.urlparse(link) #type: ignore
+            # Get the query parameters as a dictionary
+            query_params = urllib.parse.parse_qs(parsed_url.query)
+            # The actual destination URL is in the 'url' parameter
+            actual_url = query_params.get('url', [None])[0]
+
+            if actual_url:
+                # Append an Alert object instead of a tuple
+                alert_obj = Alert(title=title, url=actual_url, summary=summary)
+                alerts.append(alert_obj)
+        except Exception as e:
+            print(f"Could not parse URL for entry '{title}': {e}")
+
+    return alerts
+
+def get_feeds() -> list[tuple[str, str]]:
+    """Reads feed names and URLs from a local CSV file.
+
+    This function opens 'feeds.csv', which is expected to be in the
+    same directory as this script. The CSV must have two columns:
+    the first for the feed name and the second for the URL.
+
+    Returns:
+        list[tuple[str, str]]: A list of tuples, where each tuple
+        contains a feed's name and its URL.
+    """
+    feeds = []
+    file_path = os.path.join(os.path.dirname(__file__), "feeds.csv")
+
+    with open(file_path, mode="r", newline="", encoding="utf-8") as f:
+        reader = csv.reader(f)
+        # If your CSV has a header row, uncomment the next line to skip it
+        # next(reader, None)
+        for row in reader:
+            # Ensure the row has exactly two columns to avoid errors
+            if len(row) == 2:
+                feeds.append((row[0], row[1]))
+
+    return feeds
+
+async def fetch_site(url: str) -> str | None:
+    """
+    Fetches the main article text of a URL using Playwright and BeautifulSoup.
+
+    Args:
+        url: The URL of the website to fetch.
+
+    Returns:
+        A string containing the main text content of the page, or None on error.
+    """
+    print(f"fetching {url}")
+    async with async_playwright() as p:
+        browser = await p.chromium.launch()
+        page = await browser.new_page()
+
+        try:
+            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
+
+            content = await page.content()
+            soup = BeautifulSoup(content, 'html.parser')
+
+            # Strategy: Find the main content container
+            # First, try to find a <main> tag. If not, look for an <article> tag.
+            # You can add more fallbacks based on common website structures,
+            # e.g., soup.find('div', id='content')
+            main_content = soup.find('main')
+            if not main_content:
+                main_content = soup.find('article')
+
+            # If a main content area is found, extract text from it.
+            if main_content:
+
+                # (Optional) Remove unwanted elements like scripts or ads from within the main content
+                for element in main_content(['script', 'style', 'aside']): # type: ignore
+                    element.decompose()
+
+                main_text = main_content.get_text(separator='\n', strip=True)
+                main_text = clean_string(main_text)
+
+                print(f"SUCCESSFUL FETCH: {url}")
+                print(f"FETCH CONTENT: {main_text[:140]}...")
+                # .get_text() with separator and strip for cleaner output
+                return main_text
+            else:
+                # Fallback if no specific container is found (less reliable)
+                print("WARNING: No main content container found. Falling back to body.")
+                if soup.body:
+                    body_text = soup.body.get_text(separator='\n', strip=True)
+                    body_text = clean_string(body_text)
+                    print(f"SUCCESSFUL FETCH: {url}")
+                    print(f"FETCH CONTENT: {body_text[:140]}...")
+                    return body_text
+
+        except Exception as e:
+            print(f"FAILED FETCH: {url}")
+            print(f"An error occurred: {e}")
+            return None
+
+        finally:
+            await browser.close()
 
 async def get_all_feed_contents() -> List[Dict[str, str]]:
     """
@@ -39,54 +188,8 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
             if content:
                 pages.append({
                     "url": url,
-                    "content": clean_string(content)
+                    "content": content
                 })
 
     print(f"\nSuccessfully fetched {len(pages)} webpages.")
-    return pages
-
-def write_to_xml(pages: List[Dict[str, str]], filename: str) -> None:
-    """
-    Writes a list of page data to an XML file.
-
-    The XML structure will be:
-    <pages>
-        <page>
-            <url>http://...</url>
-            <content>...</content>
-        </page>
-        ...
-    </pages>
-
-    Args:
-        pages (List[Dict[str, str]]): The list of page data to write.
-        filename (str): The name of the output XML file.
-    """
-    root = ET.Element("pages")
-
-    for page_data in pages:
-        page_element = ET.SubElement(root, "page")
-        url_element = ET.SubElement(page_element, "url")
-        url_element.text = page_data.get("url")
-        content_element = ET.SubElement(page_element, "content")
-        content_element.text = page_data.get("content")
-
-    tree = ET.ElementTree(root)
-    # The 'xml_declaration' and 'encoding' arguments ensure it's a well-formed XML file.
-    tree.write(filename, encoding='utf-8', xml_declaration=True)
-    print(f"Data successfully written to {filename}")
-
-
-async def main() -> None:
-    """
-    Main entry point for the script.
-    """
-    all_pages = await get_all_feed_contents()
-    if all_pages:
-        write_to_xml(all_pages, "feed_contents.xml")
-    else:
-        print("No pages were fetched. XML file not created.")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
+    return pages
\ No newline at end of file
diff --git a/docker/crawler-google-alerts/get_feeds.py b/docker/crawler-google-alerts/get_feeds.py
deleted file mode 100644
index bb26186..0000000
--- a/docker/crawler-google-alerts/get_feeds.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import csv
-import os
-
-def get_feeds() -> list[tuple[str, str]]:
-    """Reads feed names and URLs from a local CSV file.
-
-    This function opens 'feeds.csv', which is expected to be in the
-    same directory as this script. The CSV must have two columns:
-    the first for the feed name and the second for the URL.
-
-    Returns:
-        list[tuple[str, str]]: A list of tuples, where each tuple
-        contains a feed's name and its URL.
-    """
-    feeds = []
-    file_path = os.path.join(os.path.dirname(__file__), "feeds.csv")
-
-    with open(file_path, mode="r", newline="", encoding="utf-8") as f:
-        reader = csv.reader(f)
-        # If your CSV has a header row, uncomment the next line to skip it
-        # next(reader, None)
-        for row in reader:
-            # Ensure the row has exactly two columns to avoid errors
-            if len(row) == 2:
-                feeds.append((row[0], row[1]))
-
-    return feeds
\ No newline at end of file
diff --git a/docker/crawler-google-alerts/get_links_from_feed.py b/docker/crawler-google-alerts/get_links_from_feed.py
deleted file mode 100644
index ce8652e..0000000
--- a/docker/crawler-google-alerts/get_links_from_feed.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from dataclasses import dataclass
-from bs4 import BeautifulSoup
-import feedparser
-import urllib.parse
-
-@dataclass
-class Alert:
-    """A simple data class to hold information about a single alert."""
-    title: str
-    url: str
-    summary: str
-
-def get_links_from_feed(rss_url: str) -> list[Alert]:
-    """
-    Parses a Google Alerts RSS feed URL and extracts the data for each alert.
-
-    Args:
-        rss_url: The URL of the Google Alerts RSS feed.
-
-    Returns:
-        A list of Alert objects. Returns an empty list if the feed
-        cannot be parsed or is empty.
-    """
-    alerts: list[Alert] = []
-    # Parse the RSS feed from the provided URL
-    feed = feedparser.parse(rss_url)
-
-    # Check if the feed was parsed successfully and has entries
-    if feed.bozo:
-        print(f"Error parsing feed: {feed.bozo_exception}")
-        return alerts
-
-    # Iterate over each entry in the feed
-    for entry in feed.entries:
-        # The title is directly available
-        title_soup = BeautifulSoup(entry.title, "html.parser") #type: ignore
-        title = title_soup.get_text()
-
-        # The summary often contains HTML, so we parse it to get clean text.
-        summary_soup = BeautifulSoup(entry.summary, 'html.parser') #type: ignore
-        summary = summary_soup.get_text()
-
-        # The link is a Google redirect URL; we extract the 'url' parameter.
-        link = entry.link
-
-        try:
-            # Parse the URL to easily access its components
-            parsed_url = urllib.parse.urlparse(link) #type: ignore
-            # Get the query parameters as a dictionary
-            query_params = urllib.parse.parse_qs(parsed_url.query)
-            # The actual destination URL is in the 'url' parameter
-            actual_url = query_params.get('url', [None])[0]
-
-            if actual_url:
-                # Append an Alert object instead of a tuple
-                alert_obj = Alert(title=title, url=actual_url, summary=summary)
-                alerts.append(alert_obj)
-        except Exception as e:
-            print(f"Could not parse URL for entry '{title}': {e}")
-
-    return alerts
\ No newline at end of file
diff --git a/docker/crawler-google-alerts/main.py b/docker/crawler-google-alerts/main.py
index aca0b5b..16dfbb6 100644
--- a/docker/crawler-google-alerts/main.py
+++ b/docker/crawler-google-alerts/main.py
@@ -239,7 +239,7 @@ async def main():
 
         # If model says 'none', skip by default (these are the irrelevant ones like US missile contracts)
         if relevance == "none":
-            print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", explanation[:200])
+            print(" ⚪ Skipping — model marked this as non-Canadian. Explanation:", tx)
             continue
 
         # basic required-field check (we want the API-required fields present)
@@ -250,7 +250,7 @@ async def main():
         # Optionally normalize some fields (convert "amount" to a canonical string) - keep simple for now
         # Save the item
         all_extracted_deals.append(tx)
-        print(f" ✔️ Kept transaction: {tx.get('company_division')} → {tx.get('recipient')} ({relevance})") # type: ignore
+        print(f" ✔️ Kept transaction: {tx}") # type: ignore
 
         # Respect rate limit
         time.sleep(1)
diff --git a/docker/crawler-google-alerts/seed_with_csv.py b/docker/crawler-google-alerts/seed_with_csv.py
new file mode 100644
index 0000000..b8056ff
--- /dev/null
+++ b/docker/crawler-google-alerts/seed_with_csv.py
@@ -0,0 +1,38 @@
+import csv
+import os
+
+import requests
+
+
+def seed_with_csv():
+    print("reading csv...")
+    file_path = os.path.join(os.path.dirname(__file__), "feeds.csv")
+
+    has_written = False
+
+    with open(file_path, mode="r", newline="", encoding="utf-8") as f:
+        reader = csv.reader(f)
+        # If your CSV has a header row, uncomment the next line to skip it
+        # next(reader, None)
+        for row in reader:
+            print("reading row...")
+            # Ensure the row has exactly two columns to avoid errors
+            if len(row) == 2:
+                try:
+                    res = requests.post("http://ploughshares.nixc.us/api/source", headers={
+                        "Content-Type": "application/json"
+                    }, json={
+                        "title":row[0],
+                        "link":row[1],
+                        "type":"Google Alert",
+                    })
+                    if "error" not in res.json():
+                        has_written = True
+                except Exception as e:
+                    print(e)
+            else:
+                print("row has incorrect length.")
+    return has_written
+
+if __name__ == "__main__":
+    seed_with_csv()
\ No newline at end of file
diff --git a/docker/ploughshares/app.py b/docker/ploughshares/app.py
index 342ef8c..a40dd84 100644
--- a/docker/ploughshares/app.py
+++ b/docker/ploughshares/app.py
@@ -990,8 +990,73 @@ def view_sources():
     return render_template('view_sources.html', sources=sources, version=VERSION)
 
+@app.route('/api/sources', methods=['GET'])
+def get_all_sources():
+    """API endpoint to get all sources"""
+    conn = get_db_connection()
+    if conn is None:
+        return jsonify({"error": "Database connection error"}), 500
+
+    sources = []
+    try:
+        with conn.cursor() as cur:
+            cur.execute('SELECT * FROM sources ORDER BY src_id DESC')
+            sources = cur.fetchall()
+
+            # Convert transactions to a list of dictionaries
+            result = list(map(lambda src: dict(src), sources))
+
+    except Exception as e:
+        logger.error(f"Database error in API: {e}")
+        return jsonify({"error": f"Database error: {str(e)}"}), 500
+    finally:
+        conn.close()
+
+    return jsonify(result)
+
 @app.route('/api/source', methods=['POST'])
 def api_create_source():
+    """API endpoint to create a source"""
+    data = request.json
+
+    # Validate required fields
+    required_fields = ['title', 'link', 'type']
+    for field in required_fields:
+        if field not in data or not data[field]:
+            return jsonify({"error": f"Missing required field: {field}"}), 400
+
+    conn = get_db_connection()
+    if conn is None:
+        return jsonify({"error": "Database connection error"}), 500
+
+    try:
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                INSERT INTO sources (
+                    title, link, type
+                ) VALUES (
+                    %(title)s, %(link)s, %(type)s
+                ) RETURNING src_id
+                """,
+                {
+                    'title': data['title'],
+                    'link': data['link'],
+                    'type': data['type']
+                }
+            )
+            result = cur.fetchone()
+            if result and 'src_id' in result:
+                conn.commit()
+                return jsonify({"message": "POST success!"}), 200
+    except Exception as e:
+        logger.error(f"Error creating source via API: {e}")
+        return jsonify({"error": e}), 400
+    finally:
+        conn.close()
+
+@app.route('/source/add', methods=['POST'])
+def create_source():
     """API endpoint to create a source"""
     data = request.form.to_dict()
 
diff --git a/docker/ploughshares/templates/view_sources.html b/docker/ploughshares/templates/view_sources.html
index f29f1ba..05be267 100644
--- a/docker/ploughshares/templates/view_sources.html
+++ b/docker/ploughshares/templates/view_sources.html
@@ -9,7 +9,7 @@

 [template markup lost in extraction: this hunk edits the header block containing the "Sources" heading and the "Add Source" control, replacing a single line, most likely to point the Add Source form at the new /source/add route; the original HTML tags are not recoverable here]
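
Usage note (not part of the patch): a minimal sketch of how the two source endpoints added to app.py above could be exercised once the service is running. The base URL below is an assumption (the patch does not pin a host; seed_with_csv.py posts to http://ploughshares.nixc.us), and the title/link values are illustrative. Only the GET /api/sources route, the POST /api/source route, and its required title/link/type fields come from the diff itself.

import requests

BASE_URL = "http://localhost:5000"  # assumed local address, not specified by the patch

# Create one source, mirroring the JSON body seed_with_csv.py sends for each CSV row.
resp = requests.post(
    f"{BASE_URL}/api/source",
    headers={"Content-Type": "application/json"},
    json={
        "title": "Example Google Alert",           # illustrative value
        "link": "https://example.com/alerts/rss",  # illustrative value
        "type": "Google Alert",
    },
)
print(resp.status_code, resp.json())  # expect 200 and {"message": "POST success!"} on success

# List every stored source through the new GET endpoint.
for src in requests.get(f"{BASE_URL}/api/sources").json():
    print(src.get("src_id"), src.get("title"), src.get("link"))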