Dorks crawler fetches RSS feeds from DB
parent 6fc12d50cb
commit 58e2d763c6
@@ -12,7 +12,6 @@ from clean_string import clean_string
 import xml.etree.ElementTree as ET
 from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
-from seed_with_csv import seed_with_csv


 @dataclass
 class Alert:
@@ -71,7 +70,28 @@ def get_links_from_feed(rss_url: str) -> list[Alert]:
     return alerts


-def get_feeds() -> list[tuple[str, str]]:
+def get_feeds_from_db() -> list[tuple[str, str]]:
+    """
+    Returns:
+        list[tuple[str, str]]: A list of tuples, where each tuple
+        contains a feed's name and its URL.
+    """
+
+    print("fetching feeds from db...")
+
+    response = requests.get("https://ploughshares.nixc.us/api/sources")
+
+    sources = response.json()
+
+    alerts = filter(lambda src: src.get("type", "") == "Google Alert", sources)
+
+    result = list(map(lambda alert: (alert.get("title", ""), alert.get("link", "")), alerts))
+
+    return result
+
+
+def get_feeds_from_csv() -> list[tuple[str, str]]:
     """Reads feed names and URLs from a local CSV file.

     This function opens 'feeds.csv', which is expected to be in the
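The committed get_feeds_from_db issues a bare requests.get: no timeout and no status check, so an API outage surfaces as a hang or a JSON decode error on an HTML error page. A minimal hardened sketch, assuming the same endpoint and payload shape; get_feeds_from_db_safe is a hypothetical name, not part of this commit:

import requests

def get_feeds_from_db_safe() -> list[tuple[str, str]]:
    # Hypothetical variant of get_feeds_from_db: adds a timeout and an
    # HTTP status check, which the committed version omits.
    response = requests.get("https://ploughshares.nixc.us/api/sources", timeout=30)
    response.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page
    sources = response.json()
    # Same filter/map logic as the commit, written as a comprehension.
    return [
        (src.get("title", ""), src.get("link", ""))
        for src in sources
        if src.get("type", "") == "Google Alert"
    ]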
@@ -167,7 +187,8 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
         List[Dict[str, str]]: A list of dictionaries, where each dictionary
         contains the 'url' and its cleaned 'content'.
     """
-    feeds: List[Tuple[str, str]] = get_feeds()
+    #feeds: List[Tuple[str, str]] = get_feeds_from_csv()
+    feeds: List[Tuple[str, str]] = get_feeds_from_db()
     urls: List[str] = []

     for keyword, feed in feeds:
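With the CSV call kept only as a comment, a database outage now stops the crawl outright. One possible arrangement, sketched under the assumption that falling back to the local feeds.csv is acceptable (the commit itself does not do this):

from typing import List, Tuple

import requests

# Sketch only: assumes get_feeds_from_db and get_feeds_from_csv from this module.
try:
    feeds: List[Tuple[str, str]] = get_feeds_from_db()
except requests.RequestException:
    # Assumption: reuse the retained CSV reader as an offline fallback.
    feeds = get_feeds_from_csv()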
@@ -11,6 +11,8 @@
         <div class="card-body">
             <form action="{{ url_for('create_source') }}" method="post" class="needs-validation" novalidate>
                 <h1>Add Source</h1>
+                <p><em>Google Alert</em> - For scraping the web using Google Alerts. Enter the search term in the <em>title</em> and the RSS feed URL in the <em>link</em>.</p>
+                <p><em>Crawler Startpoint</em> - A URL that the crawler will use as a start point. Enter the URL in the <em>link</em>. Enter the keywords you want the crawler to home in on in the <em>title</em>, separated by commas (e.g. apples, oranges, bananas).</p>
                 <label>
                     title
                     <input type="text" name="title">
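For a Google Alert entry, the form fields map straight onto the record that get_feeds_from_db filters on. An illustrative record, with field names inferred from the template and the crawler code and values made up:

# Hypothetical /api/sources record created by this form; the crawler keeps
# entries whose type is "Google Alert" and reads title (the search term)
# and link (the RSS feed URL).
source = {
    "type": "Google Alert",
    "title": "arms exports canada",
    "link": "https://www.google.com/alerts/feeds/01234567890/12345678901234567890",
}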