rss-feedmonitor/extract_website_keywords.py

227 lines
8.7 KiB
Python

#!/usr/bin/env python3
"""
Extract and consolidate keywords from the motherboardrepair.ca website.

This script reads the sst.yml and keywords.csv files from the motherboard repair
website and extracts all unique repair-related keywords for our RSS feed generator.
"""
import yaml
import csv
import json
from pathlib import Path
def extract_keywords_from_sst(sst_path=None):
    """Extract keywords from the sst.yml file.

    Reads the ``primary`` and ``secondary`` entries of the top-level
    ``global.keywords`` mapping and of each ``pages.<page>.keywords``
    mapping. An entry may be a single string or a list of keywords.

    Args:
        sst_path: Path of the YAML file to read. Defaults to
            ``../motherboardrepair.ca/sst.yml``, resolved against the
            current working directory.

    Returns:
        A set containing every keyword found. The set is empty when the
        file is empty or has no keyword sections.

    Raises:
        OSError: If the file cannot be opened.

    Errors raised by ``yaml.safe_load`` for malformed YAML propagate to the
    caller unchanged.
    """
    if sst_path is None:
        sst_path = Path("../motherboardrepair.ca/sst.yml")

    with open(sst_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    keywords = set()

    # safe_load returns None for an empty document, and the top level could
    # be a list or scalar; neither can carry the sections we look for.
    if not isinstance(data, dict):
        return keywords

    def collect(keyword_config):
        """Add the primary/secondary entries of one ``keywords`` mapping."""
        if not isinstance(keyword_config, dict):
            return
        for category in ('primary', 'secondary'):
            entry = keyword_config.get(category)
            if isinstance(entry, str):
                # A bare string must be added whole; set.update() would
                # split it into individual characters.
                keywords.add(entry)
            elif isinstance(entry, list):
                keywords.update(entry)

    # Global keywords
    global_section = data.get('global')
    if isinstance(global_section, dict):
        collect(global_section.get('keywords'))

    # Page-specific keywords
    pages = data.get('pages')
    if isinstance(pages, dict):
        for config in pages.values():
            # A page declared with no body is loaded as None.
            if isinstance(config, dict):
                collect(config.get('keywords'))

    return keywords
def extract_keywords_from_csv(csv_path=None):
    """Extract keywords from the keywords.csv file.

    Every non-empty cell of every row is treated as one keyword. Cells are
    stripped of surrounding whitespace, and cells that are empty after
    stripping are ignored.

    Args:
        csv_path: Path of the CSV file to read. Defaults to
            ``../motherboardrepair.ca/keywords.csv``, resolved against the
            current working directory.

    Returns:
        A set of the unique keyword strings found in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    if csv_path is None:
        csv_path = Path("../motherboardrepair.ca/keywords.csv")

    keywords = set()
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires. 'utf-8-sig' reads plain UTF-8 as well, but
    # drops a leading BOM so it does not end up glued to the first keyword.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.reader(f):
            for cell in row:
                cell = cell.strip()
                if cell:
                    keywords.add(cell)
    return keywords
def categorize_keywords(keywords):
    """Categorize keywords into logical groups for RSS feeds.

    Matching is a case-insensitive substring test on each keyword, applied
    in this order; the first rule that matches wins:

    1. A device term places the keyword in that device's category
       (categories are tried in the order of ``device_mappings``).
    2. A data-loss / storage-failure phrase places it in ``data_recovery``.
    3. Any other problem indicator places it in ``general_repairs``.

    Keywords matching none of the rules are left out of the result, as are
    items that are not strings.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A dict mapping category id to a dict with the keys ``name``,
        ``description``, ``devices`` (fixed list of device names) and
        ``problems`` (the matched keywords, de-duplicated and sorted).
    """
    categories = {
        "iphone_repairs": {
            "name": "iPhone Repair Requests",
            "description": "Most common iPhone repair requests",
            "devices": ["iPhone", "iPhone 12", "iPhone 13", "iPhone 14", "iPhone 15"],
            "problems": []
        },
        "macbook_repairs": {
            "name": "MacBook Repair Requests",
            "description": "MacBook hardware repair needs",
            "devices": ["MacBook", "MacBook Pro", "MacBook Air"],
            "problems": []
        },
        "ipad_repairs": {
            "name": "iPad Repair Requests",
            "description": "iPad repair and maintenance",
            "devices": ["iPad", "iPad Pro", "iPad Air", "iPad mini"],
            "problems": []
        },
        "laptop_repairs": {
            "name": "Laptop Repair Requests",
            "description": "General laptop repair discussions",
            "devices": ["laptop", "computer", "notebook"],
            "problems": []
        },
        "android_repairs": {
            "name": "Android Device Repairs",
            "description": "Android/Samsung device repair needs",
            "devices": ["Samsung", "Samsung Galaxy", "Galaxy", "Android"],
            "problems": []
        },
        "console_repairs": {
            "name": "Gaming Console Repairs",
            "description": "Console repair and maintenance",
            "devices": ["PS5", "PS4", "Xbox", "Nintendo Switch", "PlayStation"],
            "problems": []
        },
        "gpu_repairs": {
            "name": "GPU/Graphics Card Repairs",
            "description": "Graphics card and GPU repair needs",
            "devices": ["GPU", "graphics card", "RTX", "GTX", "NVIDIA"],
            "problems": []
        },
        "data_recovery": {
            "name": "Data Recovery Requests",
            "description": "Data recovery and storage repair",
            "devices": [],
            "problems": []
        },
        "general_repairs": {
            "name": "General Repair Services",
            "description": "General repair service requests",
            "devices": [],
            "problems": []
        }
    }

    # Phrases about lost data or failed storage. They are checked before the
    # general indicators so that only these land in "data_recovery" and
    # everything else falls through to "general_repairs".
    data_recovery_indicators = (
        "data recovery", "lost files", "recover data",
        "hard drive failed", "ssd dead", "storage failed",
    )

    # Keywords that indicate problems/symptoms. All entries are lowercase
    # because they are compared against the lowercased keyword.
    problem_indicators = (
        "repair", "fix", "broken", "not working", "dead", "no power",
        "won't turn on", "won't boot", "crashed", "frozen", "slow",
        "won't charge", "charging port", "screen broken", "cracked screen",
        "water damage", "liquid damage", "spilled", "dropped",
        "overheating", "loud fan", "not starting", "blue screen",
        "kernel panic", "boot loop", "black screen", "no display",
        "won't connect", "connection issues", "wifi problems",
        "speakers not working", "microphone broken", "camera not working",
        "keyboard not working", "touchpad issues", "battery dead",
        "hard drive failed", "ssd dead", "storage failed",
        "data recovery", "lost files", "recover data",
        "looking for repair", "need repair", "repair shop", "repair service",
        "professional repair", "local repair",
    )

    # Device-specific keywords mapping (lowercase substrings)
    device_mappings = {
        "iphone_repairs": ["iphone"],
        "macbook_repairs": ["macbook"],
        "ipad_repairs": ["ipad"],
        "laptop_repairs": ["laptop", "computer", "notebook"],
        "android_repairs": ["samsung", "galaxy", "android"],
        "console_repairs": ["ps5", "ps4", "xbox", "nintendo switch", "playstation"],
        "gpu_repairs": ["gpu", "graphics card", "nvidia", "rtx", "gtx"],
    }

    for keyword in keywords:
        # YAML sources can yield numbers or None; those cannot be matched.
        if not isinstance(keyword, str):
            continue
        keyword_lower = keyword.lower().strip()

        # Rule 1: first device category with a matching term.
        target = next(
            (
                category
                for category, device_terms in device_mappings.items()
                if any(term in keyword_lower for term in device_terms)
            ),
            None,
        )

        # Rules 2 and 3: device-independent problem keywords.
        if target is None:
            if any(term in keyword_lower for term in data_recovery_indicators):
                target = "data_recovery"
            elif any(term in keyword_lower for term in problem_indicators):
                target = "general_repairs"

        if target is not None:
            # Store the keyword exactly as given, not the normalized form.
            categories[target]["problems"].append(keyword)

    # Remove duplicates and sort
    for category_data in categories.values():
        category_data["problems"] = sorted(set(category_data["problems"]))

    return categories
def create_updated_keywords_file(categories, last_updated=None):
    """Build the content of the repair_keywords.json file.

    Despite its name, this function does not write anything: it only
    returns the dict, and the caller is responsible for serializing it.

    Args:
        categories: Category mapping to embed under the ``categories`` key;
            it is stored as-is, not copied.
        last_updated: Date string for the ``last_updated`` field. Defaults
            to today's date in ISO format (``YYYY-MM-DD``), so the field
            reflects when the data was actually generated.

    Returns:
        A dict with the keys ``description``, ``version``, ``source``,
        ``last_updated``, ``categories`` and ``additional_keywords``.
    """
    if last_updated is None:
        # Imported locally so the block does not depend on a new
        # module-level import.
        from datetime import date
        last_updated = date.today().isoformat()

    # Create the structure expected by our RSS generator
    keywords_data = {
        "description": "Comprehensive repair keywords extracted from motherboardrepair.ca website",
        "version": "2.0",
        "source": "motherboardrepair.ca sst.yml and keywords.csv",
        "last_updated": last_updated,
        "categories": categories,
        "additional_keywords": {
            "urgency_indicators": ["emergency", "urgent", "help needed", "asap", "quick", "fast"],
            "location_indicators": ["local", "near me", "in my area", "downtown", "nearby"],
            "service_types": ["diagnostics", "diagnostic", "troubleshooting", "microsolder", "component repair", "board repair"]
        }
    }
    return keywords_data
def main():
    """Run the extraction pipeline and write data/repair_keywords.json.

    Steps: read keywords from both website sources, merge them into one
    set, categorize them, wrap the categories in the JSON structure, write
    that structure to ``data/repair_keywords.json`` (relative to the current
    working directory) and print a short summary.

    Raises:
        OSError: If a source file cannot be read or the output file cannot
            be written.
    """
    print("🔄 Extracting keywords from motherboardrepair.ca...")

    # Extract keywords from both sources
    sst_keywords = extract_keywords_from_sst()
    csv_keywords = extract_keywords_from_csv()
    all_keywords = sst_keywords.union(csv_keywords)
    print(f"✅ Found {len(all_keywords)} unique keywords")

    # Categorize keywords
    categories = categorize_keywords(all_keywords)
    print("✅ Categorized keywords into repair types")

    # Create updated keywords file
    keywords_data = create_updated_keywords_file(categories)

    # Save to our data directory
    output_path = Path("data/repair_keywords.json")
    # open(..., 'w') does not create missing directories, so make sure the
    # target directory exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(keywords_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Updated {output_path} with website-extracted keywords")

    # Summary
    total_categorized = sum(len(cat["problems"]) for cat in categories.values())
    print("\n📊 SUMMARY:")
    print(f" - Source keywords: {len(all_keywords)}")
    print(f" - Categorized keywords: {total_categorized}")
    print(f" - Categories: {len(categories)}")
    print(" - Ready for RSS feed generation!")
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()