Dorks crawler fetches rss feeds from db
ci/woodpecker/push/woodpecker Pipeline was successful
Details
ci/woodpecker/push/woodpecker Pipeline was successful
Details
This commit is contained in:
parent
6fc12d50cb
commit
58e2d763c6
|
@ -12,7 +12,6 @@ from clean_string import clean_string
|
|||
import xml.etree.ElementTree as ET
|
||||
from playwright.async_api import async_playwright
|
||||
from bs4 import BeautifulSoup
|
||||
from seed_with_csv import seed_with_csv
|
||||
|
||||
@dataclass
|
||||
class Alert:
|
||||
|
@ -71,7 +70,28 @@ def get_links_from_feed(rss_url: str) -> list[Alert]:
|
|||
|
||||
return alerts
|
||||
|
||||
def get_feeds() -> list[tuple[str, str]]:
|
||||
|
||||
def get_feeds_from_db(timeout: float = 30.0) -> list[tuple[str, str]]:
    """Fetch the Google Alert feeds registered in the sources API.

    Requests the JSON document served at
    ``https://ploughshares.nixc.us/api/sources`` and keeps only the entries
    whose ``type`` is exactly ``"Google Alert"``.

    Args:
        timeout: Seconds to wait for the API before giving up.

    Returns:
        list[tuple[str, str]]: A list of tuples, where each tuple
            contains a feed's name (``title``) and its URL (``link``).
            A key missing from an entry is returned as an empty string.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the API answers with an HTTP error status.
    """
    print("fetching feeds from db...")

    # requests never times out by default; an unresponsive API would
    # otherwise stall the whole crawl.
    response = requests.get(
        "https://ploughshares.nixc.us/api/sources",
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error body as sources.
    response.raise_for_status()

    # NOTE(review): assumes the payload is a JSON array of objects -- confirm
    # against the API.
    sources = response.json()

    return [
        (source.get("title", ""), source.get("link", ""))
        for source in sources
        if source.get("type", "") == "Google Alert"
    ]
|
||||
|
||||
|
||||
def get_feeds_from_csv() -> list[tuple[str, str]]:
|
||||
"""Reads feed names and URLs from a local CSV file.
|
||||
|
||||
This function opens 'feeds.csv', which is expected to be in the
|
||||
|
@ -167,7 +187,8 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
|
|||
List[Dict[str, str]]: A list of dictionaries, where each dictionary
|
||||
contains the 'url' and its cleaned 'content'.
|
||||
"""
|
||||
feeds: List[Tuple[str, str]] = get_feeds()
|
||||
#feeds: List[Tuple[str, str]] = get_feeds_from_csv()
|
||||
feeds: List[Tuple[str, str]] = get_feeds_from_db()
|
||||
urls: List[str] = []
|
||||
|
||||
for keyword, feed in feeds:
|
||||
|
|
|
@ -11,6 +11,8 @@
|
|||
<div class="card-body">
|
||||
<form action="{{ url_for('create_source') }}" method="post" class="needs-validation" novalidate>
|
||||
<h1>Add Source</h1>
|
||||
<p><em>Google Alert</em> - For scraping the web using Google Alerts. Enter the search term in the <em>title</em> and the RSS feed URL in the <em>link</em>.</p>
|
||||
<p><em>Crawler Startpoint</em> - A URL that the crawler will use as a start point. Enter the URL in the <em>link</em>. Enter the keywords you want the crawler to hone in on in the <em>title</em>, separated by commas (e.g. apple, oranges, bananas).</p>
|
||||
<label>
|
||||
title
|
||||
<input type="text" name="title">
|
||||
|
|
Loading…
Reference in New Issue