Dorks crawler fetches RSS feeds from the DB
ci/woodpecker/push/woodpecker Pipeline was successful Details

This commit is contained in:
jChenvan 2025-09-06 11:19:40 -04:00
parent 6fc12d50cb
commit 58e2d763c6
2 changed files with 26 additions and 3 deletions

View File

@ -12,7 +12,6 @@ from clean_string import clean_string
import xml.etree.ElementTree as ET
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from seed_with_csv import seed_with_csv
@dataclass
class Alert:
@ -71,7 +70,28 @@ def get_links_from_feed(rss_url: str) -> list[Alert]:
return alerts
def get_feeds() -> list[tuple[str, str]]:
def get_feeds_from_db() -> list[tuple[str, str]]:
    """Fetch Google Alert feed definitions from the ploughshares sources API.

    Only sources whose ``type`` is exactly ``"Google Alert"`` are kept;
    missing ``title``/``link`` fields default to the empty string.

    Returns:
        list[tuple[str, str]]: A list of tuples, where each tuple
            contains a feed's name (the alert's search term) and its
            RSS feed URL.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    print("fetching feeds from db...")
    # Timeout so the crawler cannot hang forever if the API is unreachable.
    response = requests.get("https://ploughshares.nixc.us/api/sources", timeout=30)
    # Fail fast on HTTP errors instead of trying to JSON-decode an error page.
    response.raise_for_status()
    sources = response.json()
    # title holds the search term, link holds the RSS feed URL.
    return [
        (src.get("title", ""), src.get("link", ""))
        for src in sources
        if src.get("type", "") == "Google Alert"
    ]
def get_feeds_from_csv() -> list[tuple[str, str]]:
"""Reads feed names and URLs from a local CSV file.
This function opens 'feeds.csv', which is expected to be in the
@ -167,7 +187,8 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
List[Dict[str, str]]: A list of dictionaries, where each dictionary
contains the 'url' and its cleaned 'content'.
"""
feeds: List[Tuple[str, str]] = get_feeds()
#feeds: List[Tuple[str, str]] = get_feeds_from_csv()
feeds: List[Tuple[str, str]] = get_feeds_from_db()
urls: List[str] = []
for keyword, feed in feeds:

View File

@ -11,6 +11,8 @@
<div class="card-body">
<form action="{{ url_for('create_source') }}" method="post" class="needs-validation" novalidate>
<h1>Add Source</h1>
<p><em>Google Alert</em> - For scraping the web using google alerts. Enter the search term in the <em>title</em> and the RSS feed url in the <em>link</em></p>
<p><em>Crawler Startpoint</em> - A URL that the crawler will use as a start point. Enter the URL in the <em>link</em>. Enter the keywords you want the crawler to home in on in the <em>title</em>, separated by commas (e.g. apples, oranges, bananas).</p>
<label>
title
<input type="text" name="title">