import asyncio
from typing import Dict, List, Tuple
import xml.etree.ElementTree as ET

from clean_string import clean_string
from fetch_site import fetch_site
from get_feeds import get_feeds
from get_links_from_feed import Alert, get_links_from_feed


async def get_all_feed_contents() -> List[Dict[str, str]]:
    """
    Asynchronously fetches and processes content from multiple RSS feeds.

    This function first gets a list of RSS feeds, extracts all article URLs
    from them, and then asynchronously fetches the content of each URL.
    The content is cleaned and returned as a list of dictionaries.

    Returns:
        List[Dict[str, str]]: A list of dictionaries, where each dictionary
        contains the 'url' and its cleaned 'content'.
    """
    feeds: List[Tuple[str, str]] = get_feeds()
    urls: List[str] = []
    for keyword, feed in feeds:
        alerts: List[Alert] = get_links_from_feed(feed)
        for alert in alerts:
            urls.append(alert.url)
        print(f"{len(alerts)} links found for '{keyword}'")

    print(f"\n{len(urls)} total links found. Starting fetch process.")

    pages: List[Dict[str, str]] = []
    # Create a list of tasks and run them concurrently
    tasks = [fetch_site(url) for url in urls]
    results = await asyncio.gather(*tasks)

    for url, content in zip(urls, results):
        if content:
            pages.append({
                "url": url,
                "content": clean_string(content),
            })

    print(f"\nSuccessfully fetched {len(pages)} webpages.")
    return pages


def write_to_xml(pages: List[Dict[str, str]], filename: str) -> None:
    """
    Writes a list of page data to an XML file.

    The XML structure will be:
        <pages>
            <page>
                <url>http://...</url>
                <content>...</content>
            </page>
            ...
        </pages>

    Args:
        pages (List[Dict[str, str]]): The list of page data to write.
        filename (str): The name of the output XML file.
    """
    root = ET.Element("pages")
    for page_data in pages:
        page_element = ET.SubElement(root, "page")
        url_element = ET.SubElement(page_element, "url")
        url_element.text = page_data.get("url")
        content_element = ET.SubElement(page_element, "content")
        content_element.text = page_data.get("content")

    tree = ET.ElementTree(root)
    # The 'xml_declaration' and 'encoding' arguments ensure it's a well-formed XML file.
    tree.write(filename, encoding="utf-8", xml_declaration=True)
    print(f"Data successfully written to {filename}")


async def main() -> None:
    """
    Main entry point for the script.
    """
    all_pages = await get_all_feed_contents()
    if all_pages:
        write_to_xml(all_pages, "feed_contents.xml")
    else:
        print("No pages were fetched. XML file not created.")


if __name__ == "__main__":
    asyncio.run(main())
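

# --- Optional: a more defensive fetch variant (sketch, not wired into main) ---
# asyncio.gather() with default arguments re-raises the first exception, so a
# single failing fetch_site() call would abort the whole run above. The sketch
# below assumes fetch_site may raise on network errors; it bounds concurrency
# with a semaphore and converts failures into None so the caller can skip
# them. The function names and the limit of 10 are illustrative assumptions,
# not part of the original script.

from typing import Optional


async def fetch_site_tolerant(url: str, sem: asyncio.Semaphore) -> Optional[str]:
    """Fetch one URL, returning None instead of raising on failure."""
    async with sem:  # limit how many fetches are in flight at once
        try:
            return await fetch_site(url)
        except Exception as exc:
            print(f"Failed to fetch {url}: {exc}")
            return None


async def fetch_all_tolerant(urls: List[str], limit: int = 10) -> List[Optional[str]]:
    """Fetch all URLs with at most `limit` running concurrently."""
    sem = asyncio.Semaphore(limit)
    return await asyncio.gather(*(fetch_site_tolerant(url, sem) for url in urls))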
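

# --- Optional: reading the XML back (sketch) ---
# A quick way to verify that the output file round-trips: parse it with the
# same ElementTree module and rebuild the list of dicts that write_to_xml
# consumed. The function name is an illustrative assumption, not part of the
# original script.

def read_from_xml(filename: str) -> List[Dict[str, str]]:
    """Inverse of write_to_xml: load <page> entries back into dicts."""
    tree = ET.parse(filename)
    pages: List[Dict[str, str]] = []
    for page_element in tree.getroot().findall("page"):
        pages.append({
            "url": page_element.findtext("url", default=""),
            "content": page_element.findtext("content", default=""),
        })
    return pages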