ploughshares/docker/crawler-google-alerts/get_all_feed_contents.py

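
"""
Fetch article content from Google Alerts RSS feeds and write it to XML.

Collects article URLs from each configured alert feed, fetches every page
concurrently, cleans the text, and writes the results to an XML file.
"""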

import asyncio
import xml.etree.ElementTree as ET
from typing import Dict, List, Tuple

from clean_string import clean_string
from fetch_site import fetch_site
from get_feeds import get_feeds
from get_links_from_feed import Alert, get_links_from_feed
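
# NOTE: The sibling modules are assumed to provide the following interfaces,
# inferred from how they are used in this file:
#   get_feeds()               -> List[Tuple[str, str]]   # (keyword, feed_url) pairs
#   get_links_from_feed(feed) -> List[Alert]             # each Alert exposes a .url
#   fetch_site(url)           -> async, returns the page body, or falsy on failure
#   clean_string(text)        -> str                     # cleaned page content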


async def get_all_feed_contents() -> List[Dict[str, str]]:
    """
    Asynchronously fetches and processes content from multiple RSS feeds.

    This function first gets a list of RSS feeds, extracts all article URLs
    from them, and then asynchronously fetches the content of each URL. The
    content is cleaned and returned as a list of dictionaries.

    Returns:
        List[Dict[str, str]]: A list of dictionaries, where each dictionary
        contains the 'url' and its cleaned 'content'.
    """
    feeds: List[Tuple[str, str]] = get_feeds()
    urls: List[str] = []
    for keyword, feed in feeds:
        alerts: List[Alert] = get_links_from_feed(feed)
        for alert in alerts:
            urls.append(alert.url)
        print(f"{len(alerts)} links found for '{keyword}'")

    print(f"\n{len(urls)} total links found. Starting fetch process.")

    pages: List[Dict[str, str]] = []
    # Create a list of tasks to run concurrently
    tasks = [fetch_site(url) for url in urls]
    results = await asyncio.gather(*tasks)
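    # asyncio.gather() returns results in the same order as the awaitables it
    # was given, so the zip() below pairs each URL with its own fetched content.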
    for url, content in zip(urls, results):
        if content:
            pages.append({
                "url": url,
                "content": clean_string(content)
            })

    print(f"\nSuccessfully fetched {len(pages)} webpages.")
    return pages


def write_to_xml(pages: List[Dict[str, str]], filename: str) -> None:
    """
    Writes a list of page data to an XML file.

    The XML structure will be:
    <pages>
        <page>
            <url>http://...</url>
            <content>...</content>
        </page>
        ...
    </pages>

    Args:
        pages (List[Dict[str, str]]): The list of page data to write.
        filename (str): The name of the output XML file.
    """
    root = ET.Element("pages")
    for page_data in pages:
        page_element = ET.SubElement(root, "page")
        url_element = ET.SubElement(page_element, "url")
        url_element.text = page_data.get("url")
        content_element = ET.SubElement(page_element, "content")
        content_element.text = page_data.get("content")
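
    # ElementTree escapes '&', '<' and '>' in text nodes during serialization,
    # so the raw page content does not need manual escaping here.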
    tree = ET.ElementTree(root)
    # The 'xml_declaration' and 'encoding' arguments ensure it's a well-formed XML file.
    tree.write(filename, encoding='utf-8', xml_declaration=True)
    print(f"Data successfully written to {filename}")


async def main() -> None:
    """
    Main entry point for the script.
    """
    all_pages = await get_all_feed_contents()
    if all_pages:
        write_to_xml(all_pages, "feed_contents.xml")
    else:
        print("No pages were fetched. XML file not created.")


if __name__ == "__main__":
    asyncio.run(main())