rss-feedmonitor/scripts/generate_modular_rss_feeds.py

#!/usr/bin/env python3
"""
Modular RSS Feed Generator for Canadian Repair Monitoring
This script reads from source files (keywords.json, subreddits.json) to generate
both Markdown and OPML output files for consistent, repeatable RSS feed generation.
Features:
- Reads from external source files for easy maintenance
- Generates clean Markdown output with RSS URLs
- Creates OPML files for easy RSS reader import
- Modular design for future updates and expansion
"""
import json
import sys
import urllib.parse
import xml.etree.ElementTree as ET
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List
from xml.dom import minidom


class ModularRSSGenerator:
    def __init__(self, keywords_file="../data/repair_keywords.json",
                 subreddits_file="../data/canadian_subreddits.json"):
        self.base_search_url = "https://www.reddit.com/r/{}/search.rss?q={}&sort=new&type=link"
        self.keywords_file = Path(keywords_file)
        self.subreddits_file = Path(subreddits_file)

        # Load source data
        self.keywords = self.load_keywords()
        self.subreddits = self.load_subreddits()

        # Output files
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.markdown_file = f"../feeds/rss_feeds_{self.timestamp}.md"
        self.opml_file = f"../feeds/rss_feeds_{self.timestamp}.opml"

    def load_keywords(self) -> Dict[str, Any]:
        """Load keywords from JSON source file"""
        try:
            with open(self.keywords_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            raise FileNotFoundError(f"Keywords file not found: {self.keywords_file}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in keywords file: {e}")

    def load_subreddits(self) -> Dict[str, Any]:
        """Load subreddits from JSON source file"""
        try:
            with open(self.subreddits_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            raise FileNotFoundError(f"Subreddits file not found: {self.subreddits_file}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in subreddits file: {e}")
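
    # NOTE: the loaders above return plain dicts. The structure sketched below is
    # inferred from how fields are accessed later in this script; the real source
    # files may carry additional fields.
    #
    #   repair_keywords.json:
    #     {"categories": {"<key>": {"name": ..., "description": ...,
    #                               "devices": [...], "problems": [...]}}}
    #
    #   canadian_subreddits.json:
    #     {"priorities": {"<priority>": {"subreddits": [
    #         {"name": ..., "full_name": ..., "province": ..., "population": ...,
    #          "priority_score": ..., "description": ...}]}}}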

    def build_search_query(self, category_data: Dict[str, Any]) -> str:
        """Build Reddit search query from category data"""
        devices = category_data.get("devices", [])
        problems = category_data.get("problems", [])
        if not devices and not problems:
            # Nothing to build a query from
            return ""

        # Build device OR clause
        device_clause = " OR ".join(f'"{device}"' for device in devices) if devices else ""
        # Build problem OR clause
        problem_clause = " OR ".join(f'"{problem}"' for problem in problems) if problems else ""

        # Combine with AND
        if device_clause and problem_clause:
            return f"({device_clause}) AND ({problem_clause})"
        elif device_clause:
            return device_clause
        elif problem_clause:
            # Categories like data recovery that don't specify devices
            return problem_clause
        else:
            return ""

    def generate_feed_data(self) -> List[Dict[str, Any]]:
        """Generate RSS feed data for all combinations"""
        feeds = []
        for priority, priority_data in self.subreddits["priorities"].items():
            for subreddit_data in priority_data["subreddits"]:
                subreddit_name = subreddit_data["name"]
                for category_key, category_data in self.keywords["categories"].items():
                    search_query = self.build_search_query(category_data)
                    if not search_query:
                        continue  # Skip empty queries
                    encoded_query = urllib.parse.quote(search_query)
                    rss_url = self.base_search_url.format(subreddit_name, encoded_query)
                    feed_data = {
                        "priority": priority,
                        "priority_score": subreddit_data.get("priority_score", 0),
                        "subreddit": subreddit_name,
                        "subreddit_full": subreddit_data["full_name"],
                        "province": subreddit_data["province"],
                        "population": subreddit_data["population"],
                        "category": category_key,
                        "category_name": category_data["name"],
                        "description": category_data["description"],
                        "search_query": search_query,
                        "rss_url": rss_url,
                        "devices": category_data.get("devices", []),
                        "problems": category_data.get("problems", []),
                    }
                    feeds.append(feed_data)

        # Sort by priority score (highest first), then by subreddit
        feeds.sort(key=lambda x: (-x["priority_score"], x["subreddit"], x["category"]))
        return feeds
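
    # Illustrative result: a query of '("iPhone") AND ("cracked screen")' for
    # r/toronto (example values only) is percent-encoded by urllib.parse.quote
    # into an RSS URL such as:
    #   https://www.reddit.com/r/toronto/search.rss?q=%28%22iPhone%22%29%20AND%20%28%22cracked%20screen%22%29&sort=new&type=link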

    def generate_markdown_output(self, feeds: List[Dict[str, Any]]) -> str:
        """Generate clean markdown output"""
        output = []

        # Header
        output.append("# 📡 Canadian Repair RSS Feeds")
        output.append("")
        output.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        output.append(f"**Total Feeds:** {len(feeds)}")
        output.append("**Strategy:** Modular generation from source files for consistent results")
        output.append("")
        output.append("---")
        output.append("")

        # Summary
        output.append("## 📊 SUMMARY")
        output.append("")
        opml_basename = Path(self.opml_file).name
        output.append(f"- **OPML File:** [{opml_basename}]({self.opml_file}) (Import into RSS readers)")
        output.append(f"- **Keywords Source:** [{self.keywords_file}]({self.keywords_file})")
        output.append(f"- **Subreddits Source:** [{self.subreddits_file}]({self.subreddits_file})")
        output.append("- **Generation Script:** [generate_modular_rss_feeds.py](../scripts/generate_modular_rss_feeds.py)")
        output.append("")
        # Group feeds by priority
        current_priority = None
        current_subreddit = None
        for feed in feeds:
            # Priority header
            if feed["priority"] != current_priority:
                current_priority = feed["priority"]
                output.append(f"## {current_priority.upper()} PRIORITY")
                output.append("")

            # Subreddit header
            if feed["subreddit"] != current_subreddit:
                current_subreddit = feed["subreddit"]
                subreddit_data = next((s for p in self.subreddits["priorities"].values()
                                       for s in p["subreddits"] if s["name"] == current_subreddit), {})
                output.append(f"### {subreddit_data.get('full_name', f'r/{current_subreddit}')}")
                if "description" in subreddit_data:
                    output.append(f"**{subreddit_data['description']}**")
                output.append(f"- **Province:** {subreddit_data.get('province', 'N/A')}")
                output.append(f"- **Population:** {subreddit_data.get('population', 'N/A')}")
                output.append("")

            # Feed details
            output.append(f"#### {feed['category_name']}")
            output.append("")
            output.append(f"**Category:** {feed['category'].replace('_', ' ').title()}")
            output.append(f"**Description:** {feed['description']}")
            output.append("")
            if feed['devices']:
                output.append(f"**Devices:** {', '.join(feed['devices'][:5])}{'...' if len(feed['devices']) > 5 else ''}")
            if feed['problems']:
                output.append(f"**Problems:** {', '.join(feed['problems'][:5])}{'...' if len(feed['problems']) > 5 else ''}")
            output.append("")
            output.append("**Search Query:**")
            output.append(f"```\n{feed['search_query']}\n```")
            output.append("")
            output.append("**RSS URL:**")
            output.append(f"```\n{feed['rss_url']}\n```")
            output.append("")
            output.append("---")
            output.append("")

        # Implementation guide
        output.append("## 🚀 IMPLEMENTATION GUIDE")
        output.append("")
        output.append("### Phase 1: Start Small (Week 1)")
        output.append("- Import OPML file into your RSS reader")
        output.append("- Subscribe to 5-10 feeds from Toronto/Vancouver")
        output.append("- Monitor daily for repair opportunities")
        output.append("")
        output.append("### Phase 2: Scale Up (Weeks 2-4)")
        output.append("- Add Calgary, Edmonton, Montreal, Ottawa feeds")
        output.append("- Total: ~40 feeds across 6 major cities")
        output.append("")
        output.append("### Phase 3: Full Coverage (Month 2+)")
        output.append("- Add all remaining Canadian cities")
        output.append("- Monitor provincial subreddits")
        output.append("")
        output.append("## 🔧 RSS READER SETUP")
        output.append("")
        output.append("### Recommended Tools:")
        output.append("- **Feedly** - Web/mobile with OPML import")
        output.append("- **Inoreader** - Powerful filtering and organization")
        output.append("- **NetNewsWire** - Native macOS RSS reader")
        output.append("")
        output.append("### Organization Tips:")
        output.append("- Create folders: `Priority → City → Category`")
        output.append("- Set notifications for new posts")
        output.append("- Use starring for follow-up opportunities")
        return "\n".join(output)

    def generate_opml_output(self, feeds: List[Dict[str, Any]]) -> str:
        """Generate OPML XML output for RSS reader import"""
        # Create root element
        opml = ET.Element("opml", version="2.0")
        head = ET.SubElement(opml, "head")
        title = ET.SubElement(head, "title")
        title.text = "Canadian Repair RSS Feeds"
        date_created = ET.SubElement(head, "dateCreated")
        date_created.text = datetime.now().strftime("%a, %d %b %Y %H:%M:%S GMT")
        body = ET.SubElement(opml, "body")

        # Group feeds by priority
        current_priority = None
        current_subreddit = None
        current_outline = None
        current_sub_outline = None
        for feed in feeds:
            # Priority outline
            if feed["priority"] != current_priority:
                current_priority = feed["priority"]
                current_outline = ET.SubElement(body, "outline",
                                                text=f"{current_priority.upper()} PRIORITY",
                                                title=f"{current_priority.upper()} PRIORITY")

            # Subreddit outline
            if feed["subreddit"] != current_subreddit:
                current_subreddit = feed["subreddit"]
                subreddit_data = next((s for p in self.subreddits["priorities"].values()
                                       for s in p["subreddits"] if s["name"] == current_subreddit), {})
                current_sub_outline = ET.SubElement(current_outline, "outline",
                                                    text=subreddit_data.get("full_name", f"r/{current_subreddit}"),
                                                    title=subreddit_data.get("full_name", f"r/{current_subreddit}"))

            # Feed outline
            ET.SubElement(current_sub_outline, "outline",
                          text=feed["category_name"],
                          title=feed["category_name"],
                          type="rss",
                          xmlUrl=feed["rss_url"],
                          description=feed["description"])

        # Pretty-print the XML with minidom
        xml_str = minidom.parseString(ET.tostring(opml)).toprettyxml(indent=" ")

        # Replace the default XML declaration with a UTF-8 one and drop blank lines
        lines = xml_str.split('\n')
        content_lines = [line for line in lines[1:] if line.strip()]
        return '<?xml version="1.0" encoding="UTF-8"?>\n' + '\n'.join(content_lines)
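
    # The generated OPML nests outlines as priority -> subreddit -> feed; an
    # illustrative fragment (values are examples only) looks like:
    #   <outline text="HIGH PRIORITY" title="HIGH PRIORITY">
    #     <outline text="Toronto, Ontario" title="Toronto, Ontario">
    #       <outline text="Phone Repair" title="Phone Repair" type="rss"
    #                xmlUrl="https://www.reddit.com/r/toronto/search.rss?q=..."
    #                description="..."/>
    #     </outline>
    #   </outline>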

    def save_outputs(self):
        """Generate and save both markdown and OPML outputs"""
        print("🔄 Generating RSS feeds from source files...")

        # Generate feed data
        feeds = self.generate_feed_data()
        print(f"✅ Generated {len(feeds)} RSS feeds")

        # Generate outputs
        markdown_content = self.generate_markdown_output(feeds)
        opml_content = self.generate_opml_output(feeds)

        # Save files (ensure the output directory exists first)
        Path(self.markdown_file).parent.mkdir(parents=True, exist_ok=True)
        with open(self.markdown_file, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        with open(self.opml_file, 'w', encoding='utf-8') as f:
            f.write(opml_content)

        print(f"✅ Saved {self.markdown_file} ({len(markdown_content)} chars)")
        print(f"✅ Saved {self.opml_file} ({len(opml_content)} chars)")
        print("")
        print("📊 FEED SUMMARY:")
        print(f" - Total feeds: {len(feeds)}")
        print(f" - Source files: {self.keywords_file}, {self.subreddits_file}")
        print(" - OPML ready for RSS reader import")
        print("")
        print("🚀 Ready to monitor Canadian repair discussions!")


def main():
    try:
        generator = ModularRSSGenerator()
        generator.save_outputs()
    except Exception as e:
        print(f"❌ Error: {e}")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())