feat(report): reportSummary, topExternalDomains, broken samples/domain counts; docs: schema; chore: prune TODO scope

colin 2025-08-31 09:55:50 -04:00
parent 3eb9ab48bf
commit 89adad8ad8
3 changed files with 83 additions and 26 deletions

TODO.md

@@ -7,39 +7,24 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship.
 - [x] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly)
 - [x] Status histogram (2xx/3xx/4xx/5xx totals) in summary
 - [x] Normalize and dedupe trailing `/.` URL variants in output
-- [ ] Add compact `reportSummary` text block to JSON
-- [ ] Top external domains with counts
-- [ ] Broken links sample (first N) + per-domain broken counts
+- [x] Add compact `reportSummary` text block to JSON
+- [x] Top external domains with counts
+- [x] Broken links sample (first N) + per-domain broken counts
-### Moderate scope
-- [ ] Robots.txt summary (present, fetchedAt, sample disallow rules)
+### Core additions (default, no flags)
+- [ ] Robots.txt summary (present, fetchedAt)
 - [ ] Sitemap extras (index → child sitemaps, fetch errors)
-- [ ] Per-page response time (responseTimeMs) and content length
-- [ ] Basic page metadata: `<title>`, canonical (if present)
+- [ ] Per-page response time (responseTimeMs) and content length (basic)
+- [ ] Basic page metadata: `<title>`
 - [ ] Depth distribution (count of pages by depth)
-- [ ] Duplicate title/canonical detection (lists of URLs)
-### Content/asset analysis
-- [ ] Extract assets (images/css/js) per page with status/type/size
-- [ ] Mixed-content detection (http assets on https pages)
-- [ ] Image accessibility metric (alt present ratio)
-### Security and quality signals
-- [ ] Security headers by host (HSTS, CSP, X-Frame-Options, Referrer-Policy)
-- [ ] Insecure forms (http action on https page)
-- [ ] Large pages and slow pages (p95 thresholds) summary
 ### Link behavior and graph
-- [ ] Redirect map (from → to, hops; count summary)
-- [ ] Indegree/outdegree stats; small graph summary
+- [ ] Redirect map summary (from → to domain counts)
 ### Outputs and UX
-- [ ] CSV exports: pages.csv, links.csv, assets.csv
+- [ ] CSV exports: pages.csv, links.csv
 - [ ] NDJSON export option for streaming pipelines
-- [ ] Optional: include file/line anchors in JSON for large outputs
 ### Notes
-- Keep JSON stable and sorted; avoid breaking changes. If we change fields, bump minor version and document in `reports/REPORT_SCHEMA.md`.
-- Favor opt-in flags for heavier analyses (assets, headers) to keep default runs fast.
+- All report metrics must be gathered by default with zero flags required.
+- Keep JSON stable and sorted; update `reports/REPORT_SCHEMA.md` when fields change.


@@ -3,7 +3,9 @@ package report
 import (
     "fmt"
     "io"
+    "net/url"
     "sort"
+    "strings"
     "urlcrawler/internal/linkcheck"
 )
@@ -21,6 +23,10 @@ type Report struct {
     Metadata Metadata `json:"metadata"`
     Params   Params   `json:"params"`
     Stats    Stats    `json:"stats"`
+    ReportSummary      string                 `json:"reportSummary,omitempty"`
+    TopExternalDomains []DomainCount          `json:"topExternalDomains,omitempty"`
+    BrokenSample       []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
+    BrokenByDomain     []DomainCount          `json:"brokenByDomain,omitempty"`
 }

 type Metadata struct {
@@ -47,6 +53,11 @@ type Stats struct {
     StatusOther int `json:"statusOther"`
 }

+type DomainCount struct {
+    Domain string `json:"domain"`
+    Count  int    `json:"count"`
+}
+
 func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
     crawledList := sanitizeURLs(keys(crawled))
     sitemapList := sanitizeURLs(keys(sitemap))
@@ -92,6 +103,29 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
         }
     }

+    // Derived summaries
+    tHost := hostLower(target)
+    extCounts := map[string]int{}
+    brokenByDomain := map[string]int{}
+    var brokenSample []linkcheck.LinkStatus
+    for _, ls := range check.Statuses {
+        h := hostLower(ls.URL)
+        if h != "" && !strings.EqualFold(h, tHost) {
+            extCounts[h]++
+        }
+        if !ls.OK {
+            brokenByDomain[h]++
+            if len(brokenSample) < 10 {
+                brokenSample = append(brokenSample, ls)
+            }
+        }
+    }
+    topExternal := mapToSortedSlice(extCounts)
+    brokenBy := mapToSortedSlice(brokenByDomain)
+
+    summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
+        len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
+
     return Report{
         Target:      target,
         CrawledURLs: crawledList,
@@ -105,6 +139,10 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
         Metadata: meta,
         Params:   params,
         Stats:    st,
+        ReportSummary:      summary,
+        TopExternalDomains: topExternal,
+        BrokenSample:       brokenSample,
+        BrokenByDomain:     brokenBy,
     }
 }
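Since the derived summaries are computed inline in `Build`, a small test makes the intended behavior concrete. This is a hypothetical sketch (not part of the commit), assuming only the `linkcheck` fields used above (`Statuses`, `URL`, `OK`) and that `Build` tolerates nil maps for the remaining arguments:

```go
package report

import (
    "testing"

    "urlcrawler/internal/linkcheck"
)

// Hypothetical sanity check (not in this commit): two external links, one broken.
func TestBuildDerivedSummaries(t *testing.T) {
    check := linkcheck.Results{Statuses: []linkcheck.LinkStatus{
        {URL: "https://cdn.example.net/app.js", OK: true},
        {URL: "https://cdn.example.net/site.css", OK: true},
        {URL: "https://old.example.com/gone", OK: false},
    }}
    r := Build("https://example.com", nil, nil, nil, check, nil, Metadata{}, Params{})
    // cdn.example.net (2 links) should outrank old.example.com (1 link).
    if len(r.TopExternalDomains) == 0 || r.TopExternalDomains[0].Domain != "cdn.example.net" {
        t.Fatalf("topExternalDomains = %v", r.TopExternalDomains)
    }
    if len(r.BrokenSample) != 1 || len(r.BrokenByDomain) != 1 {
        t.Fatalf("brokenSample = %v, brokenByDomain = %v", r.BrokenSample, r.BrokenByDomain)
    }
}
```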
@@ -163,3 +201,31 @@ func sanitizeURL(u string) string {
     }
     return u
 }
+
+func hostLower(raw string) string {
+    u, err := url.Parse(raw)
+    if err != nil {
+        return ""
+    }
+    return strings.ToLower(u.Host)
+}
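Worth noting: `url.Parse` fails only on malformed input, and `u.Host` retains any explicit port, so `example.com` and `example.com:8080` count as distinct domains. A tiny hypothetical test (not in the commit) pins this down:

```go
package report

import "testing"

// Hypothetical test: the host is lowercased, the port is kept,
// and unparseable input maps to the empty string.
func TestHostLower(t *testing.T) {
    if got := hostLower("HTTPS://Example.COM:8443/path"); got != "example.com:8443" {
        t.Fatalf("got %q", got)
    }
    if got := hostLower("://missing-scheme"); got != "" {
        t.Fatalf("got %q", got)
    }
}
```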
+
+func mapToSortedSlice(m map[string]int) []DomainCount {
+    if len(m) == 0 {
+        return nil
+    }
+    out := make([]DomainCount, 0, len(m))
+    for k, v := range m {
+        out = append(out, DomainCount{Domain: k, Count: v})
+    }
+    sort.Slice(out, func(i, j int) bool {
+        if out[i].Count == out[j].Count {
+            return out[i].Domain < out[j].Domain
+        }
+        return out[i].Count > out[j].Count
+    })
+    if len(out) > 10 {
+        out = out[:10]
+    }
+    return out
+}
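And the companion behavior for `mapToSortedSlice`: descending by count, ties broken alphabetically, truncated to the ten largest entries. A hypothetical check (not in the commit):

```go
package report

import (
    "reflect"
    "testing"
)

// Hypothetical test: higher counts sort first, and equal counts
// fall back to lexicographic domain order.
func TestMapToSortedSlice(t *testing.T) {
    got := mapToSortedSlice(map[string]int{"b.net": 2, "a.org": 2, "c.com": 5})
    want := []DomainCount{
        {Domain: "c.com", Count: 5},
        {Domain: "a.org", Count: 2},
        {Domain: "b.net", Count: 2},
    }
    if !reflect.DeepEqual(got, want) {
        t.Fatalf("got %v, want %v", got, want)
    }
}
```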


@@ -91,9 +91,15 @@ This document describes the structure of the JSON reports produced by `urlcrawler`
 - **status5xx** (number)
 - **statusOther** (number)
+- **reportSummary** (string): Compact summary string like `crawled=7 sitemap=7 links=26 ok=26 broken=0`.
+- **topExternalDomains** (DomainCount[]): Top external domains referenced by links.
+- **brokenSample** (LinkStatus[]): Up to 10 example broken links.
+- **brokenByDomain** (DomainCount[]): Broken link counts grouped by domain.

 ### Notes
 - URLs are normalized and deduplicated during crawl. Minor variants like trailing `/.` are normalized in output.
+- All metrics described here are included by default; no extra flags are required.
 - Content-type filtering: only `text/html` pages are parsed for outlinks.
 - Sitemap fetching is best-effort; absence is not treated as an error.
 - The JSON lists are sorted to produce stable outputs across runs.
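Putting the new fields together, a report excerpt could look like the following (values are illustrative; `brokenSample` is omitted here since the `LinkStatus` shape is documented separately, and `omitempty` drops empty summaries entirely):

```json
{
  "reportSummary": "crawled=7 sitemap=7 links=26 ok=25 broken=1",
  "topExternalDomains": [
    { "domain": "cdn.example.net", "count": 4 },
    { "domain": "fonts.example.org", "count": 2 }
  ],
  "brokenByDomain": [
    { "domain": "old.example.com", "count": 1 }
  ]
}
```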