ploughshares/docker/crawler_dorks/get_links_from_feed.py

from dataclasses import dataclass
import urllib.parse

import feedparser
from bs4 import BeautifulSoup


@dataclass
class Alert:
    """A simple data class to hold information about a single alert."""

    title: str
    url: str
    summary: str


def get_links_from_feed(rss_url: str) -> list[Alert]:
    """
    Parses a Google Alerts RSS feed URL and extracts the data for each alert.

    Args:
        rss_url: The URL of the Google Alerts RSS feed.

    Returns:
        A list of Alert objects. Returns an empty list if the feed
        cannot be parsed or is empty.
    """
    alerts: list[Alert] = []

    # Parse the RSS feed from the provided URL.
    feed = feedparser.parse(rss_url)

    # feedparser sets `bozo` when the feed is malformed; bail out early
    # rather than processing a broken feed.
    if feed.bozo:
        print(f"Error parsing feed: {feed.bozo_exception}")
        return alerts
    # Iterate over each entry in the feed.
    for entry in feed.entries:
        # The title can contain HTML markup, so parse it down to plain text.
        title_soup = BeautifulSoup(entry.title, "html.parser")  # type: ignore
        title = title_soup.get_text()

        # The summary often contains HTML, so we parse it to get clean text.
        summary_soup = BeautifulSoup(entry.summary, "html.parser")  # type: ignore
        summary = summary_soup.get_text()

        # The link is a Google redirect URL; we extract the 'url' parameter.
        link = entry.link
        try:
            # Parse the URL to easily access its components.
            parsed_url = urllib.parse.urlparse(link)  # type: ignore
            # Get the query parameters as a dictionary.
            query_params = urllib.parse.parse_qs(parsed_url.query)
            # The actual destination URL is in the 'url' parameter.
            actual_url = query_params.get("url", [None])[0]
            if actual_url:
                alerts.append(Alert(title=title, url=actual_url, summary=summary))
        except Exception as e:
            print(f"Could not parse URL for entry '{title}': {e}")

    return alerts
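

if __name__ == "__main__":
    # Minimal usage sketch. The URL below is a placeholder, not a real feed:
    # substitute the RSS URL from your own Google Alert's "Deliver to" setting.
    # With an unreachable or malformed feed, feedparser sets `bozo`, so the
    # function prints an error and returns an empty list.
    example_feed = "https://www.google.com/alerts/feeds/<user-id>/<alert-id>"
    for alert in get_links_from_feed(example_feed):
        print(f"{alert.title}\n  {alert.url}\n  {alert.summary}\n")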