#!/usr/bin/env python3
"""
Modular RSS Feed Generator for Canadian Repair Monitoring

This script reads from source files (keywords.json, subreddits.json) to generate
both Markdown and OPML output files for consistent, repeatable RSS feed generation.

Features:
- Reads from external source files for easy maintenance
- Generates clean Markdown output with RSS URLs
- Creates OPML files for easy RSS reader import
- Modular design for future updates and expansion
"""

import json
import urllib.parse
import xml.etree.ElementTree as ET
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List
class ModularRSSGenerator:
    """Generates Reddit search RSS feeds (Markdown + OPML) from JSON source files."""

    def __init__(self, keywords_file="../data/repair_keywords.json", subreddits_file="../data/canadian_subreddits.json"):
        """Load the keyword/subreddit source data and derive timestamped output paths."""
        # URL template: first slot is the subreddit name, second the URL-encoded query.
        self.base_search_url = "https://www.reddit.com/r/{}/search.rss?q={}&sort=new&type=link"
        self.keywords_file = Path(keywords_file)
        self.subreddits_file = Path(subreddits_file)

        # Source data is loaded eagerly so bad files fail fast at construction time.
        self.keywords = self.load_keywords()
        self.subreddits = self.load_subreddits()

        # Timestamped output names keep successive runs from clobbering each other.
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.markdown_file = f"../feeds/rss_feeds_{self.timestamp}.md"
        self.opml_file = f"../feeds/rss_feeds_{self.timestamp}.opml"
def load_keywords(self) -> Dict[str, Any]:
|
|
"""Load keywords from JSON source file"""
|
|
try:
|
|
with open(self.keywords_file, 'r', encoding='utf-8') as f:
|
|
return json.load(f)
|
|
except FileNotFoundError:
|
|
raise FileNotFoundError(f"Keywords file not found: {self.keywords_file}")
|
|
except json.JSONDecodeError as e:
|
|
raise ValueError(f"Invalid JSON in keywords file: {e}")
|
|
|
|
def load_subreddits(self) -> Dict[str, Any]:
|
|
"""Load subreddits from JSON source file"""
|
|
try:
|
|
with open(self.subreddits_file, 'r', encoding='utf-8') as f:
|
|
return json.load(f)
|
|
except FileNotFoundError:
|
|
raise FileNotFoundError(f"Subreddits file not found: {self.subreddits_file}")
|
|
except json.JSONDecodeError as e:
|
|
raise ValueError(f"Invalid JSON in subreddits file: {e}")
|
|
|
|
def build_search_query(self, category_data: Dict[str, Any]) -> str:
|
|
"""Build Reddit search query from category data"""
|
|
devices = category_data.get("devices", [])
|
|
problems = category_data.get("problems", [])
|
|
|
|
if not devices and not problems:
|
|
# For categories like data recovery that don't specify devices
|
|
return " OR ".join(f'"{problem}"' for problem in problems)
|
|
|
|
# Build device OR clause
|
|
device_clause = " OR ".join(f'"{device}"' for device in devices) if devices else ""
|
|
|
|
# Build problem OR clause
|
|
problem_clause = " OR ".join(f'"{problem}"' for problem in problems) if problems else ""
|
|
|
|
# Combine with AND
|
|
if device_clause and problem_clause:
|
|
return f"({device_clause}) AND ({problem_clause})"
|
|
elif device_clause:
|
|
return device_clause
|
|
elif problem_clause:
|
|
return problem_clause
|
|
else:
|
|
return ""
|
|
|
|
def generate_feed_data(self) -> List[Dict[str, Any]]:
|
|
"""Generate RSS feed data for all combinations"""
|
|
feeds = []
|
|
|
|
for priority, priority_data in self.subreddits["priorities"].items():
|
|
for subreddit_data in priority_data["subreddits"]:
|
|
subreddit_name = subreddit_data["name"]
|
|
|
|
for category_key, category_data in self.keywords["categories"].items():
|
|
search_query = self.build_search_query(category_data)
|
|
|
|
if not search_query:
|
|
continue # Skip empty queries
|
|
|
|
encoded_query = urllib.parse.quote(search_query)
|
|
rss_url = self.base_search_url.format(subreddit_name, encoded_query)
|
|
|
|
feed_data = {
|
|
"priority": priority,
|
|
"priority_score": subreddit_data.get("priority_score", 0),
|
|
"subreddit": subreddit_name,
|
|
"subreddit_full": subreddit_data["full_name"],
|
|
"province": subreddit_data["province"],
|
|
"population": subreddit_data["population"],
|
|
"category": category_key,
|
|
"category_name": category_data["name"],
|
|
"description": category_data["description"],
|
|
"search_query": search_query,
|
|
"rss_url": rss_url,
|
|
"devices": category_data.get("devices", []),
|
|
"problems": category_data.get("problems", [])
|
|
}
|
|
|
|
feeds.append(feed_data)
|
|
|
|
# Sort by priority score (highest first), then by subreddit
|
|
feeds.sort(key=lambda x: (-x["priority_score"], x["subreddit"], x["category"]))
|
|
|
|
return feeds
|
|
|
|
def generate_markdown_output(self, feeds: List[Dict[str, Any]]) -> str:
|
|
"""Generate clean markdown output"""
|
|
output = []
|
|
|
|
# Header
|
|
output.append("# 📡 Canadian Repair RSS Feeds")
|
|
output.append("")
|
|
output.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
|
output.append(f"**Total Feeds:** {len(feeds)}")
|
|
output.append("**Strategy:** Modular generation from source files for consistent results")
|
|
output.append("")
|
|
output.append("---")
|
|
output.append("")
|
|
|
|
# Summary
|
|
output.append("## 📊 SUMMARY")
|
|
output.append("")
|
|
opml_basename = Path(self.opml_file).name
|
|
output.append(f"- **OPML File:** [{opml_basename}]({self.opml_file}) (Import into RSS readers)")
|
|
output.append(f"- **Keywords Source:** [{self.keywords_file}]({self.keywords_file})")
|
|
output.append(f"- **Subreddits Source:** [{self.subreddits_file}]({self.subreddits_file})")
|
|
output.append(f"- **Generation Script:** [generate_modular_rss_feeds.py](generate_modular_rss_feeds.py)")
|
|
output.append("")
|
|
|
|
# Group feeds by priority
|
|
current_priority = None
|
|
current_subreddit = None
|
|
|
|
for feed in feeds:
|
|
# Priority header
|
|
if feed["priority"] != current_priority:
|
|
current_priority = feed["priority"]
|
|
output.append(f"## {current_priority.upper()} PRIORITY")
|
|
output.append("")
|
|
|
|
# Subreddit header
|
|
if feed["subreddit"] != current_subreddit:
|
|
current_subreddit = feed["subreddit"]
|
|
subreddit_data = next((s for p in self.subreddits["priorities"].values()
|
|
for s in p["subreddits"] if s["name"] == current_subreddit), {})
|
|
output.append(f"### {subreddit_data.get('full_name', f'r/{current_subreddit}')}")
|
|
|
|
if "description" in subreddit_data:
|
|
output.append(f"**{subreddit_data['description']}**")
|
|
output.append(f"- **Province:** {subreddit_data.get('province', 'N/A')}")
|
|
output.append(f"- **Population:** {subreddit_data.get('population', 'N/A')}")
|
|
output.append("")
|
|
|
|
# Feed details
|
|
output.append(f"#### {feed['category_name']}")
|
|
output.append("")
|
|
output.append(f"**Category:** {feed['category'].replace('_', ' ').title()}")
|
|
output.append(f"**Description:** {feed['description']}")
|
|
output.append("")
|
|
|
|
if feed['devices']:
|
|
output.append(f"**Devices:** {', '.join(feed['devices'][:5])}{'...' if len(feed['devices']) > 5 else ''}")
|
|
if feed['problems']:
|
|
output.append(f"**Problems:** {', '.join(feed['problems'][:5])}{'...' if len(feed['problems']) > 5 else ''}")
|
|
output.append("")
|
|
|
|
output.append("**Search Query:**")
|
|
output.append(f"```\n{feed['search_query']}\n```")
|
|
output.append("")
|
|
|
|
output.append("**RSS URL:**")
|
|
output.append(f"```\n{feed['rss_url']}\n```")
|
|
output.append("")
|
|
output.append("---")
|
|
output.append("")
|
|
|
|
# Implementation guide
|
|
output.append("## 🚀 IMPLEMENTATION GUIDE")
|
|
output.append("")
|
|
output.append("### Phase 1: Start Small (Week 1)")
|
|
output.append("- Import OPML file into your RSS reader")
|
|
output.append("- Subscribe to 5-10 feeds from Toronto/Vancouver")
|
|
output.append("- Monitor daily for repair opportunities")
|
|
output.append("")
|
|
output.append("### Phase 2: Scale Up (Weeks 2-4)")
|
|
output.append("- Add Calgary, Edmonton, Montreal, Ottawa feeds")
|
|
output.append("- Total: ~40 feeds across 6 major cities")
|
|
output.append("")
|
|
output.append("### Phase 3: Full Coverage (Month 2+)")
|
|
output.append("- Add all remaining Canadian cities")
|
|
output.append("- Monitor provincial subreddits")
|
|
output.append("")
|
|
output.append("## 🔧 RSS READER SETUP")
|
|
output.append("")
|
|
output.append("### Recommended Tools:")
|
|
output.append("- **Feedly** - Web/mobile with OPML import")
|
|
output.append("- **Inoreader** - Powerful filtering and organization")
|
|
output.append("- **NetNewsWire** - Native macOS RSS reader")
|
|
output.append("")
|
|
output.append("### Organization Tips:")
|
|
output.append("- Create folders: `Priority → City → Category`")
|
|
output.append("- Set notifications for new posts")
|
|
output.append("- Use starring for follow-up opportunities")
|
|
|
|
return "\n".join(output)
|
|
|
|
def generate_opml_output(self, feeds: List[Dict[str, Any]]) -> str:
|
|
"""Generate OPML XML output for RSS reader import"""
|
|
# Create root element
|
|
opml = ET.Element("opml", version="2.0")
|
|
head = ET.SubElement(opml, "head")
|
|
title = ET.SubElement(head, "title")
|
|
title.text = "Canadian Repair RSS Feeds"
|
|
date_created = ET.SubElement(head, "dateCreated")
|
|
date_created.text = datetime.now().strftime("%a, %d %b %Y %H:%M:%S GMT")
|
|
|
|
body = ET.SubElement(opml, "body")
|
|
|
|
# Group feeds by priority
|
|
current_priority = None
|
|
current_subreddit = None
|
|
current_outline = None
|
|
current_sub_outline = None
|
|
|
|
for feed in feeds:
|
|
# Priority outline
|
|
if feed["priority"] != current_priority:
|
|
current_priority = feed["priority"]
|
|
current_outline = ET.SubElement(body, "outline",
|
|
text=f"{current_priority.upper()} PRIORITY",
|
|
title=f"{current_priority.upper()} PRIORITY")
|
|
|
|
# Subreddit outline
|
|
if feed["subreddit"] != current_subreddit:
|
|
current_subreddit = feed["subreddit"]
|
|
subreddit_data = next((s for p in self.subreddits["priorities"].values()
|
|
for s in p["subreddits"] if s["name"] == current_subreddit), {})
|
|
current_sub_outline = ET.SubElement(current_outline, "outline",
|
|
text=subreddit_data.get("full_name", f"r/{current_subreddit}"),
|
|
title=subreddit_data.get("full_name", f"r/{current_subreddit}"))
|
|
|
|
# Feed outline
|
|
ET.SubElement(current_sub_outline, "outline",
|
|
text=feed["category_name"],
|
|
title=feed["category_name"],
|
|
type="rss",
|
|
xmlUrl=feed["rss_url"],
|
|
description=feed["description"])
|
|
|
|
# Convert to string with proper formatting
|
|
rough_string = ET.tostring(opml, encoding='unicode')
|
|
reparsed = ET.fromstring(rough_string)
|
|
|
|
# Pretty print XML
|
|
from xml.dom import minidom
|
|
xml_str = minidom.parseString(ET.tostring(reparsed)).toprettyxml(indent=" ")
|
|
|
|
# Remove XML declaration and clean up
|
|
lines = xml_str.split('\n')
|
|
# Skip the XML declaration and empty lines
|
|
content_lines = [line for line in lines[1:] if line.strip()]
|
|
|
|
return '<?xml version="1.0" encoding="UTF-8"?>\n' + '\n'.join(content_lines)
|
|
|
|
def save_outputs(self):
|
|
"""Generate and save both markdown and OPML outputs"""
|
|
print("🔄 Generating RSS feeds from source files...")
|
|
|
|
# Generate feed data
|
|
feeds = self.generate_feed_data()
|
|
print(f"✅ Generated {len(feeds)} RSS feeds")
|
|
|
|
# Generate outputs
|
|
markdown_content = self.generate_markdown_output(feeds)
|
|
opml_content = self.generate_opml_output(feeds)
|
|
|
|
# Save files
|
|
with open(self.markdown_file, 'w', encoding='utf-8') as f:
|
|
f.write(markdown_content)
|
|
|
|
with open(self.opml_file, 'w', encoding='utf-8') as f:
|
|
f.write(opml_content)
|
|
|
|
print(f"✅ Saved {self.markdown_file} ({len(markdown_content)} chars)")
|
|
print(f"✅ Saved {self.opml_file} ({len(opml_content)} chars)")
|
|
print("")
|
|
print("📊 FEED SUMMARY:")
|
|
print(f" - Total feeds: {len(feeds)}")
|
|
print(f" - Source files: {self.keywords_file}, {self.subreddits_file}")
|
|
print(f" - OPML ready for RSS reader import")
|
|
print("")
|
|
print("🚀 Ready to monitor Canadian repair discussions!")
|
|
|
|
def main():
    """CLI entry point: generate the feeds; return a shell-style exit status."""
    try:
        generator = ModularRSSGenerator()
        generator.save_outputs()
    except Exception as e:
        # Surface any failure (missing/invalid source files, I/O errors) as a
        # short message plus a non-zero exit code instead of a traceback.
        print(f"❌ Error: {e}")
        return 1

    return 0


if __name__ == "__main__":
    # Bug fix: raise SystemExit instead of calling the site-module exit()
    # helper, which is not guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(main())