#!/usr/bin/env python3
"""Validate Google Alert query blocks and generate working replacements."""
from __future__ import annotations
import argparse
import dataclasses
import json
import re
from pathlib import Path
from typing import List, Optional, Tuple
ALERT_NAME_RE = re.compile(r"`([^`]+)`")
HEADING_RE = re.compile(r"^(#{3,})\s+(.*)")
SITE_RE = re.compile(r"site:[^\s)]+", re.IGNORECASE)
OR_RE = re.compile(r"\bOR\b", re.IGNORECASE)
QUOTE_RE = re.compile(r'"([^"]+)"')
NEGATIVE_TOKEN_RE = re.compile(r"(?:^|\s)-(?!\s)([^\s]+)")
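# NEGATIVE_TOKEN_RE captures "-term" exclusions: a hyphen preceded by
# start-of-string or whitespace and followed by a non-space token.
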
# Regional groupings for Canadian subreddits
REDDIT_REGIONS = {
    "Ontario-GTA": ["r/kitchener", "r/waterloo", "r/CambridgeON", "r/guelph", "r/toronto", "r/mississauga", "r/brampton"],
    "Ontario-Other": ["r/ontario", "r/londonontario", "r/HamiltonOntario", "r/niagara", "r/ottawa"],
    "Western": ["r/vancouver", "r/VictoriaBC", "r/Calgary", "r/Edmonton"],
    "Prairies": ["r/saskatoon", "r/regina", "r/winnipeg"],
    "Eastern": ["r/montreal", "r/quebeccity", "r/halifax", "r/newfoundland"],
}

@dataclasses.dataclass
class AlertBlock:
    heading: str
    alert_name: str
    purpose: Optional[str]
    target: Optional[str]
    query: str
    start_line: int

@dataclasses.dataclass
class Finding:
    rule: str
    severity: str
    message: str
    suggestion: str

@dataclasses.dataclass
class Analysis:
    alert: AlertBlock
    metrics: dict
    findings: List[Finding]
    fixed_queries: List[Tuple[str, str]]  # [(alert_name, query)]

def parse_alerts(markdown_path: Path) -> List[AlertBlock]:
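    """Scan a markdown file for **Alert Name:** entries and collect each
    alert's metadata plus the fenced code block holding its query."""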
    text = markdown_path.read_text(encoding="utf-8")
    lines = text.splitlines()
    alerts: List[AlertBlock] = []
    current_heading = ""
    pending: Optional[dict] = None
    code_lines: List[str] = []
    collecting_code = False
    for idx, raw_line in enumerate(lines, start=1):
        line = raw_line.rstrip("\n")
        heading_match = HEADING_RE.match(line)
        if heading_match:
            hashes, heading_text = heading_match.groups()
            if len(hashes) >= 3:  # only capture tertiary sections
                current_heading = heading_text.strip()
        if line.startswith("**Alert Name:**"):
            match = ALERT_NAME_RE.search(line)
            alert_name = match.group(1).strip() if match else line.split("**Alert Name:**", 1)[1].strip()
            pending = {
                "heading": current_heading,
                "alert_name": alert_name,
                "purpose": None,
                "target": None,
                "query": None,
                "start_line": idx,
            }
            continue
        if pending:
            if line.startswith("**Purpose:**"):
                pending["purpose"] = line.split("**Purpose:**", 1)[1].strip()
                continue
            if line.startswith("**Target:**"):
                pending["target"] = line.split("**Target:**", 1)[1].strip()
                continue
        if line.strip() == "```":
            if not pending:
                # ignore code blocks unrelated to alerts
                collecting_code = False
                code_lines = []
                continue
            if not collecting_code:
                collecting_code = True
                code_lines = []
            else:
                collecting_code = False
                query_text = "\n".join(code_lines).strip()
                alert_block = AlertBlock(
                    heading=pending["heading"],
                    alert_name=pending["alert_name"],
                    purpose=pending["purpose"],
                    target=pending["target"],
                    query=query_text,
                    start_line=pending["start_line"],
                )
                alerts.append(alert_block)
                pending = None
                code_lines = []
            continue
        if collecting_code:
            code_lines.append(line)
    return alerts

def extract_query_parts(query: str) -> Tuple[List[str], List[str], List[str]]:
"""Extract site filters, keywords, and exclusions from query."""
sites = SITE_RE.findall(query)
# Extract all quoted phrases first (these are the keywords)
all_keywords = QUOTE_RE.findall(query)
# Filter out ALERT_NAME markers
keywords = [kw for kw in all_keywords if not kw.startswith("ALERT_NAME:")]
# Find exclusions (negative terms)
exclusions = []
for match in NEGATIVE_TOKEN_RE.finditer(query):
term = match.group(1)
# Skip if it's part of quoted text
if '"' not in match.group(0):
exclusions.append(term)
return sites, keywords, exclusions
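
# Example of extract_query_parts on a hypothetical query:
#   extract_query_parts('(site:reddit.com/r/toronto) "water heater" -job')
#   -> (['site:reddit.com/r/toronto'], ['water heater'], ['job'])
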
def generate_fixed_queries(alert: AlertBlock, findings: List[Finding]) -> List[Tuple[str, str]]:
"""Generate working replacement queries when issues are found."""
if not findings or not any(f.severity == "high" for f in findings):
return []
sites, keywords, exclusions = extract_query_parts(alert.query)
fixed = []
# Check if this is a Reddit alert with too many sites
is_reddit = any("reddit.com" in s for s in sites)
has_site_issue = any(f.rule == "site-filter-limit" for f in findings)
has_term_issue = any(f.rule == "term-limit" for f in findings)
if is_reddit and has_site_issue:
# Split by region
for region_name, subreddits in REDDIT_REGIONS.items():
# Limit keywords to top 10-12 most specific ones
top_keywords = keywords[:12] if has_term_issue else keywords[:18]
site_part = " OR ".join([f"site:reddit.com/{sub}" for sub in subreddits])
keyword_part = " OR ".join([f'"{kw}"' for kw in top_keywords])
exclusion_part = " ".join([f"-{ex}" for ex in exclusions[:4]]) # Limit exclusions
fixed_query = f"({site_part})\n({keyword_part})\n{exclusion_part}".strip()
# Verify it meets limits
test_metrics = {
"site_filters": len(subreddits),
"approx_terms": len(top_keywords),
"char_length": len(fixed_query),
}
if test_metrics["site_filters"] <= 8 and test_metrics["approx_terms"] <= 18 and test_metrics["char_length"] <= 500:
new_name = f"{alert.alert_name.replace(' - Reddit CA', '')} - {region_name}"
fixed.append((new_name, fixed_query))
elif has_term_issue and not is_reddit:
# For non-Reddit, just trim keywords
top_keywords = keywords[:15]
site_part = " OR ".join(sites)
keyword_part = " OR ".join([f'"{kw}"' for kw in top_keywords])
exclusion_part = " ".join([f"-{ex}" for ex in exclusions[:4]])
if site_part:
fixed_query = f"({site_part})\n({keyword_part})\n{exclusion_part}".strip()
else:
fixed_query = f"({keyword_part})\n{exclusion_part}".strip()
if len(fixed_query) <= 500:
fixed.append((alert.alert_name + " (Fixed)", fixed_query))
return fixed
def evaluate(alert: AlertBlock) -> Analysis:
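    """Compute heuristic metrics for one alert and flag queries that are
    likely to exceed Google Alerts' practical limits."""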
    query = alert.query
    normalized = " ".join(query.split())
    site_filters = SITE_RE.findall(query)
    or_count = len(OR_RE.findall(query))
    approx_terms = or_count + 1
    quoted_phrases = len(QUOTE_RE.findall(query))
    negative_tokens = len(NEGATIVE_TOKEN_RE.findall(query))
    char_length = len(normalized)
    line_count = query.count("\n") + 1
    metrics = {
        "site_filters": len(site_filters),
        "or_operators": or_count,
        "approx_terms": approx_terms,
        "quoted_phrases": quoted_phrases,
        "negative_tokens": negative_tokens,
        "char_length": char_length,
        "line_count": line_count,
    }
    findings: List[Finding] = []
    if metrics["site_filters"] > 12:
        findings.append(Finding(
            rule="site-filter-limit",
            severity="high",
            message=f"Contains {metrics['site_filters']} site: filters, usually more than Google Alerts handles reliably.",
            suggestion="Split the geography into multiple alerts with fewer site: clauses each.",
        ))
    if metrics["approx_terms"] > 28:
        findings.append(Finding(
            rule="term-limit",
            severity="high",
            message=f"Approximately {metrics['approx_terms']} OR terms detected (>28).",
            suggestion="Break the keyword block into two alerts or remove low-value phrases.",
        ))
    if metrics["quoted_phrases"] > 12:
        findings.append(Finding(
            rule="quoted-phrases",
            severity="medium",
            message=f"Uses {metrics['quoted_phrases']} exact-phrase matches, reducing the match surface.",
            suggestion="Convert some exact phrases into (keyword AND variant) pairs to widen matches.",
        ))
    if metrics["char_length"] > 600:
        findings.append(Finding(
            rule="length",
            severity="medium",
            message=f"Query is {metrics['char_length']} characters long (Google truncates beyond ~512).",
            suggestion="Remove redundant OR terms or shorten site filter lists.",
        ))
    if metrics["negative_tokens"] > 8:
        findings.append(Finding(
            rule="exclusion-limit",
            severity="low",
            message=f"Contains {metrics['negative_tokens']} negative filters; excess exclusions may hide valid leads.",
            suggestion="Keep only the highest-noise sources (e.g., -job -jobs).",
        ))
    if metrics["line_count"] > 3:
        findings.append(Finding(
            rule="multiline",
            severity="low",
            message="Query spans more than three lines, which often indicates chained filters beyond alert limits.",
            suggestion="Condense by running separate alerts per platform or intent.",
        ))
    fixed_queries = generate_fixed_queries(alert, findings)
    return Analysis(alert=alert, metrics=metrics, findings=findings, fixed_queries=fixed_queries)

def format_markdown(analyses: List[Analysis]) -> str:
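    """Render each analysis as a short human-readable markdown section."""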
    lines: List[str] = []
    for analysis in analyses:
        alert = analysis.alert
        lines.append(f"### {alert.alert_name}")
        heading = alert.heading or "(No heading)"
        lines.append(f"Section: {heading}")
        lines.append(f"Start line: {alert.start_line}")
        metric_parts = [
            f"site:{analysis.metrics['site_filters']}",
            f"ORs:{analysis.metrics['or_operators']}",
            f"phrases:{analysis.metrics['quoted_phrases']}",
            f"len:{analysis.metrics['char_length']}",
        ]
        lines.append("Metrics: " + ", ".join(metric_parts))
        if analysis.findings:
            lines.append("Findings:")
            for finding in analysis.findings:
                lines.append(f"- ({finding.severity}) {finding.message} Suggestion: {finding.suggestion}")
        else:
            lines.append("Findings: None detected by heuristics.")
        lines.append("")
    return "\n".join(lines).strip() + "\n"

def generate_fixed_markdown(analyses: List[Analysis]) -> str:
"""Generate new markdown with working queries."""
lines = ["# Google Alert Queries - Working Versions", "",
"These queries have been validated to work within Google Alerts limits.",
"Each query stays under 500 chars, uses ≤8 site filters, and ≤18 OR terms.", ""]
for analysis in analyses:
alert = analysis.alert
if analysis.fixed_queries:
# Use fixed versions
for new_name, new_query in analysis.fixed_queries:
lines.append(f"## {new_name}")
if alert.purpose:
lines.append(f"**Purpose:** {alert.purpose}")
if alert.target:
lines.append(f"**Target:** {alert.target}")
lines.append("")
lines.append("```")
lines.append(new_query)
lines.append("```")
lines.append("")
elif not any(f.severity == "high" for f in analysis.findings):
# Query is already OK, keep it
lines.append(f"## {alert.alert_name}")
if alert.purpose:
lines.append(f"**Purpose:** {alert.purpose}")
if alert.target:
lines.append(f"**Target:** {alert.target}")
lines.append("")
lines.append("```")
lines.append(alert.query)
lines.append("```")
lines.append("")
return "\n".join(lines)
def run(markdown_path: Path, output_format: str, fix_mode: bool) -> None:
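    """Parse the alerts file, evaluate each block, and print the report."""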
    alerts = parse_alerts(markdown_path)
    analyses = [evaluate(alert) for alert in alerts]
    if fix_mode:
        print(generate_fixed_markdown(analyses))
    elif output_format == "json":
        payload = [
            {
                "alert_name": analysis.alert.alert_name,
                "heading": analysis.alert.heading,
                "start_line": analysis.alert.start_line,
                "metrics": analysis.metrics,
                "findings": [dataclasses.asdict(f) for f in analysis.findings],
                "fixed_count": len(analysis.fixed_queries),
            }
            for analysis in analyses
        ]
        print(json.dumps(payload, indent=2))
    else:
        print(format_markdown(analyses))

def main() -> None:
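    """Parse CLI arguments and dispatch to run()."""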
    parser = argparse.ArgumentParser(description="Validate Google Alert queries and generate working replacements.")
    parser.add_argument("markdown", nargs="?", default="docs/google-alerts.md", help="Path to the markdown file containing alerts.")
    parser.add_argument("--format", choices=["markdown", "json"], default="markdown")
    parser.add_argument("--fix", action="store_true", help="Generate fixed/working queries")
    args = parser.parse_args()
    markdown_path = Path(args.markdown)
    if not markdown_path.exists():
        raise SystemExit(f"File not found: {markdown_path}")
    run(markdown_path, args.format, args.fix)

if __name__ == "__main__":
    main()