From 58e2d763c62e4ac11cf579b2511ed9bb95b70a3e Mon Sep 17 00:00:00 2001
From: jChenvan <188939308+jChenvan@users.noreply.github.com>
Date: Sat, 6 Sep 2025 11:19:40 -0400
Subject: [PATCH] Dorks crawler fetches rss feeds from db

---
 .../get_all_feed_contents.py                  | 36 +++++++++++++++++++++++++++++++++---
 .../ploughshares/templates/view_sources.html  |  2 ++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/docker/crawler-google-alerts/get_all_feed_contents.py b/docker/crawler-google-alerts/get_all_feed_contents.py
index eb72fdb..2495c89 100644
--- a/docker/crawler-google-alerts/get_all_feed_contents.py
+++ b/docker/crawler-google-alerts/get_all_feed_contents.py
@@ -12,7 +12,6 @@ from clean_string import clean_string
 import xml.etree.ElementTree as ET
 from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
-from seed_with_csv import seed_with_csv
 
 @dataclass
 class Alert:
@@ -71,7 +70,37 @@ def get_links_from_feed(rss_url: str) -> list[Alert]:
 
     return alerts
 
-def get_feeds() -> list[tuple[str, str]]:
+
+def get_feeds_from_db() -> list[tuple[str, str]]:
+    """Fetch Google Alert feeds from the Ploughshares sources API.
+
+    Keeps only the sources whose "type" is "Google Alert".
+
+    Returns:
+        list[tuple[str, str]]: A list of tuples, where each tuple
+        contains a feed's name and its URL.
+
+    Raises:
+        requests.RequestException: If the request fails, times out, or
+            the API responds with an HTTP error status.
+    """
+    print("fetching feeds from db...")
+
+    # Bound the wait and surface HTTP errors instead of hanging forever
+    # or parsing an error page as if it were the source list.
+    response = requests.get(
+        "https://ploughshares.nixc.us/api/sources", timeout=30
+    )
+    response.raise_for_status()
+
+    return [
+        (source.get("title", ""), source.get("link", ""))
+        for source in response.json()
+        if source.get("type", "") == "Google Alert"
+    ]
+
+
+def get_feeds_from_csv() -> list[tuple[str, str]]:
     """Reads feed names and URLs from a local CSV file.
 
     This function opens 'feeds.csv', which is expected to be in the
@@ -167,7 +196,8 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
         List[Dict[str, str]]: A list of dictionaries, where each dictionary
         contains the 'url' and its cleaned 'content'.
     """
-    feeds: List[Tuple[str, str]] = get_feeds()
+    #feeds: List[Tuple[str, str]] = get_feeds_from_csv()
+    feeds: List[Tuple[str, str]] = get_feeds_from_db()
     urls: List[str] = []
 
     for keyword, feed in feeds:
diff --git a/docker/ploughshares/templates/view_sources.html b/docker/ploughshares/templates/view_sources.html
index 05be267..f600673 100644
--- a/docker/ploughshares/templates/view_sources.html
+++ b/docker/ploughshares/templates/view_sources.html
@@ -11,6 +11,8 @@