import asyncio
import xml.etree.ElementTree as ET
from typing import Dict, List, Tuple

from clean_string import clean_string
from fetch_site import fetch_site
from get_feeds import get_feeds
from get_links_from_feed import Alert, get_links_from_feed


async def get_all_feed_contents() -> List[Dict[str, str]]:
    """
    Asynchronously fetches and processes content from multiple RSS feeds.

    This function first gets a list of RSS feeds, extracts all article URLs
    from them, and then asynchronously fetches the content of each URL. The
    content is cleaned and returned as a list of dictionaries.

    Returns:
        List[Dict[str, str]]: A list of dictionaries, where each dictionary
            contains the 'url' and its cleaned 'content'.
    """
    feeds: List[Tuple[str, str]] = get_feeds()
    urls: List[str] = []

    for keyword, feed in feeds:
        alerts: List[Alert] = get_links_from_feed(feed)
        for alert in alerts:
            urls.append(alert.url)
        print(f"{len(alerts)} links found for '{keyword}'")

    print(f"\n{len(urls)} total links found. Starting fetch process.")
    pages: List[Dict[str, str]] = []

    # Create a list of tasks to run concurrently
    tasks = [fetch_site(url) for url in urls]
    results = await asyncio.gather(*tasks)

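    # Note: by default asyncio.gather re-raises the first exception from any
    # task, so one failed fetch aborts the whole run. If fetch_site does not
    # already catch its own errors, passing return_exceptions=True (and then
    # skipping non-string results) is one way to keep the pages that did
    # succeed.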
    for url, content in zip(urls, results):
        if content:
            pages.append({
                "url": url,
                "content": clean_string(content)
            })

    print(f"\nSuccessfully fetched {len(pages)} webpages.")
    return pages


def write_to_xml(pages: List[Dict[str, str]], filename: str) -> None:
    """
    Writes a list of page data to an XML file.

    The XML structure will be:
    <pages>
        <page>
            <url>http://...</url>
            <content>...</content>
        </page>
        ...
    </pages>

    Args:
        pages (List[Dict[str, str]]): The list of page data to write.
        filename (str): The name of the output XML file.
    """
    root = ET.Element("pages")

    for page_data in pages:
        page_element = ET.SubElement(root, "page")
        url_element = ET.SubElement(page_element, "url")
        url_element.text = page_data.get("url")
        content_element = ET.SubElement(page_element, "content")
        content_element.text = page_data.get("content")

    tree = ET.ElementTree(root)
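    # Optional: on Python 3.9+, ET.indent(tree, space="  ") can be called here
    # to pretty-print the output; the file is well-formed XML either way.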
    # The 'xml_declaration' and 'encoding' arguments ensure it's a well-formed XML file.
    tree.write(filename, encoding='utf-8', xml_declaration=True)
    print(f"Data successfully written to {filename}")


async def main() -> None:
    """
    Main entry point for the script.
    """
    all_pages = await get_all_feed_contents()
    if all_pages:
        write_to_xml(all_pages, "feed_contents.xml")
    else:
        print("No pages were fetched. XML file not created.")


if __name__ == "__main__":
    asyncio.run(main())