ploughshares/docker/crawler-google-alerts/get_all_feed_contents.py

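
"""
Fetch article content from Google Alerts RSS feeds and write it to XML.

Collects article URLs from each configured alert feed, fetches every page
concurrently, cleans the text, and writes the results to an XML file.
"""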

import asyncio
import xml.etree.ElementTree as ET
from typing import Dict, List, Tuple

from clean_string import clean_string
from fetch_site import fetch_site
from get_feeds import get_feeds
from get_links_from_feed import Alert, get_links_from_feed
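
# NOTE: The sibling modules are assumed to provide the following interfaces,
# inferred from how they are used in this file:
#   get_feeds()               -> List[Tuple[str, str]]   # (keyword, feed_url) pairs
#   get_links_from_feed(feed) -> List[Alert]             # each Alert exposes a .url
#   fetch_site(url)           -> async, returns the page body, or falsy on failure
#   clean_string(text)        -> str                     # cleaned page content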


async def get_all_feed_contents() -> List[Dict[str, str]]:
    """
    Asynchronously fetches and processes content from multiple RSS feeds.

    This function first gets a list of RSS feeds, extracts all article URLs
    from them, and then asynchronously fetches the content of each URL. The
    content is cleaned and returned as a list of dictionaries.

    Returns:
        List[Dict[str, str]]: A list of dictionaries, where each dictionary
        contains the 'url' and its cleaned 'content'.
    """
    feeds: List[Tuple[str, str]] = get_feeds()
    urls: List[str] = []
    for keyword, feed in feeds:
        alerts: List[Alert] = get_links_from_feed(feed)
        for alert in alerts:
            urls.append(alert.url)
        print(f"{len(alerts)} links found for '{keyword}'")

    print(f"\n{len(urls)} total links found. Starting fetch process.")

    pages: List[Dict[str, str]] = []
    # Create a list of tasks to run concurrently
    tasks = [fetch_site(url) for url in urls]
    results = await asyncio.gather(*tasks)
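    # asyncio.gather() returns results in the same order as the awaitables it
    # was given, so the zip() below pairs each URL with its own fetched content.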
    for url, content in zip(urls, results):
        if content:
            pages.append({
                "url": url,
                "content": clean_string(content)
            })

    print(f"\nSuccessfully fetched {len(pages)} webpages.")
    return pages


def write_to_xml(pages: List[Dict[str, str]], filename: str) -> None:
    """
    Writes a list of page data to an XML file.

    The XML structure will be:
    <pages>
        <page>
            <url>http://...</url>
            <content>...</content>
        </page>
        ...
    </pages>

    Args:
        pages (List[Dict[str, str]]): The list of page data to write.
        filename (str): The name of the output XML file.
    """
    root = ET.Element("pages")
    for page_data in pages:
        page_element = ET.SubElement(root, "page")
        url_element = ET.SubElement(page_element, "url")
        url_element.text = page_data.get("url")
        content_element = ET.SubElement(page_element, "content")
        content_element.text = page_data.get("content")
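
    # ElementTree escapes '&', '<' and '>' in text nodes during serialization,
    # so the raw page content does not need manual escaping here.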
    tree = ET.ElementTree(root)
    # The 'xml_declaration' and 'encoding' arguments ensure it's a well-formed XML file.
    tree.write(filename, encoding='utf-8', xml_declaration=True)
    print(f"Data successfully written to {filename}")


async def main() -> None:
    """
    Main entry point for the script.
    """
    all_pages = await get_all_feed_contents()
    if all_pages:
        write_to_xml(all_pages, "feed_contents.xml")
    else:
        print("No pages were fetched. XML file not created.")


if __name__ == "__main__":
    asyncio.run(main())