Dorks crawler fetches RSS feeds from the DB
ci/woodpecker/push/woodpecker Pipeline was successful Details

This commit is contained in:
jChenvan 2025-09-06 11:19:40 -04:00
parent 6fc12d50cb
commit 58e2d763c6
2 changed files with 26 additions and 3 deletions

View File

@ -12,7 +12,6 @@ from clean_string import clean_string
import xml.etree.ElementTree as ET
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from seed_with_csv import seed_with_csv
@dataclass
class Alert:
@ -71,7 +70,28 @@ def get_links_from_feed(rss_url: str) -> list[Alert]:
return alerts
def get_feeds() -> list[tuple[str, str]]:
def get_feeds_from_db() -> list[tuple[str, str]]:
    """Fetch Google Alert feed definitions from the ploughshares sources API.

    Only sources whose ``type`` is exactly ``"Google Alert"`` are kept;
    missing ``title``/``link`` fields default to the empty string.

    Returns:
        list[tuple[str, str]]: A list of tuples, where each tuple
            contains a feed's name (the alert's search term) and its
            RSS feed URL.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    print("fetching feeds from db...")
    # Timeout so the crawler cannot hang forever if the API is unreachable.
    response = requests.get("https://ploughshares.nixc.us/api/sources", timeout=30)
    # Fail fast on HTTP errors instead of trying to JSON-decode an error page.
    response.raise_for_status()
    sources = response.json()
    # title holds the search term, link holds the RSS feed URL.
    return [
        (src.get("title", ""), src.get("link", ""))
        for src in sources
        if src.get("type", "") == "Google Alert"
    ]
def get_feeds_from_csv() -> list[tuple[str, str]]:
"""Reads feed names and URLs from a local CSV file.
This function opens 'feeds.csv', which is expected to be in the
@ -167,7 +187,8 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
List[Dict[str, str]]: A list of dictionaries, where each dictionary
contains the 'url' and its cleaned 'content'.
"""
feeds: List[Tuple[str, str]] = get_feeds()
#feeds: List[Tuple[str, str]] = get_feeds_from_csv()
feeds: List[Tuple[str, str]] = get_feeds_from_db()
urls: List[str] = []
for keyword, feed in feeds:

View File

@ -11,6 +11,8 @@
<div class="card-body">
<form action="{{ url_for('create_source') }}" method="post" class="needs-validation" novalidate>
<h1>Add Source</h1>
<p><em>Google Alert</em> - For scraping the web using google alerts. Enter the search term in the <em>title</em> and the RSS feed url in the <em>link</em></p>
<p><em>Crawler Startpoint</em> - A URL that the crawler will use as a start point. Enter the URL in the <em>link</em>. Enter the keywords you want the crawler to home in on in the <em>title</em>, separated by commas (e.g. apples, oranges, bananas).</p>
<label>
title
<input type="text" name="title">