ploughshares/docker/crawler_dorks/get_links_from_feed.py

from dataclasses import dataclass
import urllib.parse

import feedparser
from bs4 import BeautifulSoup


@dataclass
class Alert:
    """A simple data class to hold information about a single alert."""

    title: str
    url: str
    summary: str


def get_links_from_feed(rss_url: str) -> list[Alert]:
    """
    Parses a Google Alerts RSS feed URL and extracts the data for each alert.

    Args:
        rss_url: The URL of the Google Alerts RSS feed.

    Returns:
        A list of Alert objects. Returns an empty list if the feed
        cannot be parsed or is empty.
    """
    alerts: list[Alert] = []

    # Parse the RSS feed from the provided URL.
    feed = feedparser.parse(rss_url)

    # feedparser sets `bozo` when the feed is malformed; bail out early
    # rather than processing a broken feed.
    if feed.bozo:
        print(f"Error parsing feed: {feed.bozo_exception}")
        return alerts
    # Iterate over each entry in the feed.
    for entry in feed.entries:
        # The title can contain HTML markup, so parse it down to plain text.
        title_soup = BeautifulSoup(entry.title, "html.parser")  # type: ignore
        title = title_soup.get_text()

        # The summary often contains HTML, so we parse it to get clean text.
        summary_soup = BeautifulSoup(entry.summary, "html.parser")  # type: ignore
        summary = summary_soup.get_text()

        # The link is a Google redirect URL; we extract the 'url' parameter.
        link = entry.link
        try:
            # Parse the URL to easily access its components.
            parsed_url = urllib.parse.urlparse(link)  # type: ignore
            # Get the query parameters as a dictionary.
            query_params = urllib.parse.parse_qs(parsed_url.query)
            # The actual destination URL is in the 'url' parameter.
            actual_url = query_params.get("url", [None])[0]
            if actual_url:
                alerts.append(Alert(title=title, url=actual_url, summary=summary))
        except Exception as e:
            print(f"Could not parse URL for entry '{title}': {e}")

    return alerts
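

if __name__ == "__main__":
    # Minimal usage sketch. The URL below is a placeholder, not a real feed:
    # substitute the RSS URL from your own Google Alert's "Deliver to" setting.
    # With an unreachable or malformed feed, feedparser sets `bozo`, so the
    # function prints an error and returns an empty list.
    example_feed = "https://www.google.com/alerts/feeds/<user-id>/<alert-id>"
    for alert in get_links_from_feed(example_feed):
        print(f"{alert.title}\n  {alert.url}\n  {alert.summary}\n")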