Dorks crawler fetches RSS feeds from DB
parent 6fc12d50cb
commit 58e2d763c6
@@ -12,7 +12,6 @@ from clean_string import clean_string
 import xml.etree.ElementTree as ET
 from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
-from seed_with_csv import seed_with_csv


 @dataclass
 class Alert:
@@ -71,7 +70,28 @@ def get_links_from_feed(rss_url: str) -> list[Alert]:
     return alerts


-def get_feeds() -> list[tuple[str, str]]:
+def get_feeds_from_db() -> list[tuple[str, str]]:
+    """
+    Returns:
+        list[tuple[str, str]]: A list of tuples, where each tuple
+        contains a feed's name and its URL.
+    """
+
+    print("fetching feeds from db...")
+
+    response = requests.get("https://ploughshares.nixc.us/api/sources")
+
+    sources = response.json()
+
+    alerts = filter(lambda src: src.get("type", "") == "Google Alert", sources)
+
+    result = list(map(lambda alert: (alert.get("title", ""), alert.get("link", "")), alerts))
+
+    return result
+
+
+def get_feeds_from_csv() -> list[tuple[str, str]]:
     """Reads feed names and URLs from a local CSV file.

     This function opens 'feeds.csv', which is expected to be in the
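The committed get_feeds_from_db issues a bare requests.get: no timeout and no status check, so an API outage surfaces as a hang or a JSON decode error on an HTML error page. A minimal hardened sketch, assuming the same endpoint and payload shape; get_feeds_from_db_safe is a hypothetical name, not part of this commit:

import requests

def get_feeds_from_db_safe() -> list[tuple[str, str]]:
    # Hypothetical variant of get_feeds_from_db: adds a timeout and an
    # HTTP status check, which the committed version omits.
    response = requests.get("https://ploughshares.nixc.us/api/sources", timeout=30)
    response.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page
    sources = response.json()
    # Same filter/map logic as the commit, written as a comprehension.
    return [
        (src.get("title", ""), src.get("link", ""))
        for src in sources
        if src.get("type", "") == "Google Alert"
    ]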
@@ -167,7 +187,8 @@ async def get_all_feed_contents() -> List[Dict[str, str]]:
         List[Dict[str, str]]: A list of dictionaries, where each dictionary
         contains the 'url' and its cleaned 'content'.
     """
-    feeds: List[Tuple[str, str]] = get_feeds()
+    #feeds: List[Tuple[str, str]] = get_feeds_from_csv()
+    feeds: List[Tuple[str, str]] = get_feeds_from_db()
     urls: List[str] = []

     for keyword, feed in feeds:
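With the CSV call kept only as a comment, a database outage now stops the crawl outright. One possible arrangement, sketched under the assumption that falling back to the local feeds.csv is acceptable (the commit itself does not do this):

from typing import List, Tuple

import requests

# Sketch only: assumes get_feeds_from_db and get_feeds_from_csv from this module.
try:
    feeds: List[Tuple[str, str]] = get_feeds_from_db()
except requests.RequestException:
    # Assumption: reuse the retained CSV reader as an offline fallback.
    feeds = get_feeds_from_csv()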
@@ -11,6 +11,8 @@
         <div class="card-body">
             <form action="{{ url_for('create_source') }}" method="post" class="needs-validation" novalidate>
                 <h1>Add Source</h1>
+                <p><em>Google Alert</em> - For scraping the web using Google Alerts. Enter the search term in the <em>title</em> and the RSS feed URL in the <em>link</em>.</p>
+                <p><em>Crawler Startpoint</em> - A URL that the crawler will use as a start point. Enter the URL in the <em>link</em>. Enter the keywords you want the crawler to home in on in the <em>title</em>, separated by commas (e.g. apples, oranges, bananas).</p>
                 <label>
                     title
                     <input type="text" name="title">
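For a Google Alert entry, the form fields map straight onto the record that get_feeds_from_db filters on. An illustrative record, with field names inferred from the template and the crawler code and values made up:

# Hypothetical /api/sources record created by this form; the crawler keeps
# entries whose type is "Google Alert" and reads title (the search term)
# and link (the RSS feed URL).
source = {
    "type": "Google Alert",
    "title": "arms exports canada",
    "link": "https://www.google.com/alerts/feeds/01234567890/12345678901234567890",
}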