ploughshares/docker/crawler-google-alerts/get_all_feed_contents.py

import asyncio
import csv
import json
import os
import urllib.parse
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from typing import Dict, List, Tuple

import feedparser
import requests
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

from clean_string import clean_string


@dataclass
class Alert:
    """A simple data class to hold information about a single alert."""

    title: str
    url: str
    summary: str


def get_links_from_feed(rss_url: str) -> list[Alert]:
    """
    Parses a Google Alerts RSS feed URL and extracts the data for each alert.

    Args:
        rss_url: The URL of the Google Alerts RSS feed.

    Returns:
        A list of Alert objects. Returns an empty list if the feed
        cannot be parsed or is empty.
    """
    alerts: list[Alert] = []

    # Parse the RSS feed from the provided URL
    feed = feedparser.parse(rss_url)

    # Check that the feed was parsed successfully
    if feed.bozo:
        print(f"Error parsing feed: {feed.bozo_exception}")
        return alerts

    # Iterate over each entry in the feed
    for entry in feed.entries:
        # The title and summary often contain HTML, so parse them to get clean text.
        title_soup = BeautifulSoup(entry.title, "html.parser")  # type: ignore
        title = title_soup.get_text()
        summary_soup = BeautifulSoup(entry.summary, "html.parser")  # type: ignore
        summary = summary_soup.get_text()

        # The link is a Google redirect URL; we extract the 'url' parameter.
        link = entry.link
        try:
            # Parse the URL to easily access its components
            parsed_url = urllib.parse.urlparse(link)  # type: ignore
            # Get the query parameters as a dictionary
            query_params = urllib.parse.parse_qs(parsed_url.query)
            # The actual destination URL is in the 'url' parameter
            actual_url = query_params.get('url', [None])[0]
            if actual_url:
                alerts.append(Alert(title=title, url=actual_url, summary=summary))
        except Exception as e:
            print(f"Could not parse URL for entry '{title}': {e}")

    return alerts
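
# Illustrative only: Google Alerts entry links are redirect URLs shaped roughly like
#   https://www.google.com/url?rct=j&sa=t&url=https://example.com/article&ct=ga&...
# and get_links_from_feed() keeps only the 'url' query parameter
# (the hypothetical https://example.com/article above).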


def get_feeds_from_db() -> list[tuple[str, str]]:
    """Fetches the list of Google Alert feeds from the Ploughshares sources API.

    Returns:
        list[tuple[str, str]]: A list of tuples, where each tuple
        contains a feed's name and its URL.
    """
    print("fetching feeds from db...")
    response = requests.get("https://ploughshares.nixc.us/api/sources")
    sources = response.json()
    # Keep only sources of type "Google Alert" and map them to (title, link) pairs.
    alerts = filter(lambda src: src.get("type", "") == "Google Alert", sources)
    return [(alert.get("title", ""), alert.get("link", "")) for alert in alerts]
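
# Illustrative only: get_feeds_from_db() assumes the /api/sources endpoint returns a
# JSON list of objects carrying at least "type", "title", and "link" keys, e.g.
# (hypothetical values):
#
#   [
#     {"type": "Google Alert", "title": "arms exports",
#      "link": "https://www.google.com/alerts/feeds/..."},
#     {"type": "RSS", "title": "unrelated source", "link": "https://example.com/feed"}
#   ]
#
# Entries whose "type" is not exactly "Google Alert" are dropped.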


def get_feeds_from_csv() -> list[tuple[str, str]]:
    """Reads feed names and URLs from a local CSV file.

    This function opens 'feeds.csv', which is expected to be in the
    same directory as this script. The CSV must have two columns:
    the first for the feed name and the second for the URL.

    Returns:
        list[tuple[str, str]]: A list of tuples, where each tuple
        contains a feed's name and its URL.
    """
    feeds = []
    file_path = os.path.join(os.path.dirname(__file__), "feeds.csv")
    with open(file_path, mode="r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        # If your CSV has a header row, uncomment the next line to skip it
        # next(reader, None)
        for row in reader:
            # Ensure the row has exactly two columns to avoid errors
            if len(row) == 2:
                feeds.append((row[0], row[1]))
    return feeds
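
# Illustrative only: a feeds.csv in the format get_feeds_from_csv() expects
# (no header row by default; names and URLs are hypothetical):
#
#   arms exports,https://www.google.com/alerts/feeds/000/111
#   defence procurement,https://www.google.com/alerts/feeds/000/222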


async def fetch_site(url: str) -> str | None:
    """
    Fetches the main article text of a URL using Playwright and BeautifulSoup.

    Args:
        url: The URL of the website to fetch.

    Returns:
        A string containing the main text content of the page, or None on error.
    """
    print(f"fetching {url}")
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        try:
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
            content = await page.content()
            soup = BeautifulSoup(content, 'html.parser')

            # Strategy: find the main content container.
            # First, try to find a <main> tag. If not, look for an <article> tag.
            # You can add more fallbacks based on common website structures,
            # e.g., soup.find('div', id='content').
            main_content = soup.find('main')
            if not main_content:
                main_content = soup.find('article')

            # If a main content area is found, extract text from it.
            if main_content:
                # (Optional) Remove unwanted elements like scripts or ads from within the main content
                for element in main_content(['script', 'style', 'aside']):  # type: ignore
                    element.decompose()
                # .get_text() with separator and strip for cleaner output
                main_text = main_content.get_text(separator='\n', strip=True)
                main_text = clean_string(main_text)
                print(f"SUCCESSFUL FETCH: {url}")
                return main_text
            else:
                # Fallback if no specific container is found (less reliable)
                print("WARNING: No main content container found. Falling back to body.")
                if soup.body:
                    body_text = soup.body.get_text(separator='\n', strip=True)
                    body_text = clean_string(body_text)
                    print(f"SUCCESSFUL FETCH: {url}")
                    return body_text
        except Exception as e:
            print(f"FAILED FETCH: {url}")
            print(f"An error occurred: {e}")
            return None
        finally:
            await browser.close()


async def get_all_feed_contents() -> List[Dict[str, str]]:
    """
    Asynchronously fetches and processes content from multiple RSS feeds.

    This function first gets a list of RSS feeds, extracts all article URLs from them,
    and then asynchronously fetches the content of each URL. The content is cleaned
    and returned as a list of dictionaries.

    Returns:
        List[Dict[str, str]]: A list of dictionaries, where each dictionary
        contains the 'url' and its cleaned 'content'.
    """
    # feeds: List[Tuple[str, str]] = get_feeds_from_csv()
    feeds: List[Tuple[str, str]] = get_feeds_from_db()

    # Collect every article URL from every feed.
    urls: List[str] = []
    for keyword, feed in feeds:
        alerts: List[Alert] = get_links_from_feed(feed)
        for alert in alerts:
            urls.append(alert.url)
        print(f"{len(alerts)} links found for '{keyword}'")

    print(f"\n{len(urls)} total links found. Starting fetch process.")

    pages: List[Dict[str, str]] = []
    # Create a list of tasks to run concurrently
    tasks = [fetch_site(url) for url in urls]
    results = await asyncio.gather(*tasks)

    for url, content in zip(urls, results):
        if content:
            pages.append({
                "url": url,
                "content": content,
            })

    print(f"\nSuccessfully fetched {len(pages)} webpages.")

    # Log every URL alongside its fetched content (or None on failure) for debugging.
    with open("logs.json", "w") as f:
        json.dump({
            "urls": urls,
            "results": results,
        }, f, indent=4)

    return pages
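

# Minimal usage sketch, assuming this module is run directly as the crawler's entry
# point; if the repo drives get_all_feed_contents() from elsewhere, this block is
# unnecessary.
if __name__ == "__main__":
    fetched_pages = asyncio.run(get_all_feed_contents())
    print(f"Fetched {len(fetched_pages)} pages.")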