feat(report): reportSummary, topExternalDomains, broken samples/domain counts; docs: schema; chore: prune TODO scope
This commit is contained in:
parent
3eb9ab48bf
commit
89adad8ad8
37
TODO.md
37
TODO.md
|
@ -7,39 +7,24 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship.
|
||||||
- [x] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly)
|
- [x] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly)
|
||||||
- [x] Status histogram (2xx/3xx/4xx/5xx totals) in summary
|
- [x] Status histogram (2xx/3xx/4xx/5xx totals) in summary
|
||||||
- [x] Normalize and dedupe trailing `/.` URL variants in output
|
- [x] Normalize and dedupe trailing `/.` URL variants in output
|
||||||
- [ ] Add compact `reportSummary` text block to JSON
|
- [x] Add compact `reportSummary` text block to JSON
|
||||||
- [ ] Top external domains with counts
|
- [x] Top external domains with counts
|
||||||
- [ ] Broken links sample (first N) + per-domain broken counts
|
- [x] Broken links sample (first N) + per-domain broken counts
|
||||||
|
|
||||||
### Moderate scope
|
### Core additions (default, no flags)
|
||||||
- [ ] Robots.txt summary (present, fetchedAt, sample disallow rules)
|
- [ ] Robots.txt summary (present, fetchedAt)
|
||||||
- [ ] Sitemap extras (index → child sitemaps, fetch errors)
|
- [ ] Sitemap extras (index → child sitemaps, fetch errors)
|
||||||
- [ ] Per-page response time (responseTimeMs) and content length
|
- [ ] Per-page response time (responseTimeMs) and content length (basic)
|
||||||
- [ ] Basic page metadata: `<title>`, canonical (if present)
|
- [ ] Basic page metadata: `<title>`
|
||||||
- [ ] Depth distribution (count of pages by depth)
|
- [ ] Depth distribution (count of pages by depth)
|
||||||
- [ ] Duplicate title/canonical detection (lists of URLs)
|
- [ ] Redirect map summary (from → to domain counts)
|
||||||
|
|
||||||
### Content/asset analysis
|
|
||||||
- [ ] Extract assets (images/css/js) per page with status/type/size
|
|
||||||
- [ ] Mixed-content detection (http assets on https pages)
|
|
||||||
- [ ] Image accessibility metric (alt present ratio)
|
|
||||||
|
|
||||||
### Security and quality signals
|
|
||||||
- [ ] Security headers by host (HSTS, CSP, X-Frame-Options, Referrer-Policy)
|
|
||||||
- [ ] Insecure forms (http action on https page)
|
|
||||||
- [ ] Large pages and slow pages (p95 thresholds) summary
|
|
||||||
|
|
||||||
### Link behavior and graph
|
|
||||||
- [ ] Redirect map (from → to, hops; count summary)
|
|
||||||
- [ ] Indegree/outdegree stats; small graph summary
|
|
||||||
|
|
||||||
### Outputs and UX
|
### Outputs and UX
|
||||||
- [ ] CSV exports: pages.csv, links.csv, assets.csv
|
- [ ] CSV exports: pages.csv, links.csv
|
||||||
- [ ] NDJSON export option for streaming pipelines
|
- [ ] NDJSON export option for streaming pipelines
|
||||||
- [ ] Optional: include file/line anchors in JSON for large outputs
|
|
||||||
|
|
||||||
### Notes
|
### Notes
|
||||||
- Keep JSON stable and sorted; avoid breaking changes. If we change fields, bump minor version and document in `reports/REPORT_SCHEMA.md`.
|
- All report metrics must be gathered by default with zero flags required.
|
||||||
- Favor opt-in flags for heavier analyses (assets, headers) to keep default runs fast.
|
- Keep JSON stable and sorted; update `reports/REPORT_SCHEMA.md` when fields change.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,9 @@ package report
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
|
"net/url"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"urlcrawler/internal/linkcheck"
|
"urlcrawler/internal/linkcheck"
|
||||||
)
|
)
|
||||||
|
@ -21,6 +23,10 @@ type Report struct {
|
||||||
Metadata Metadata `json:"metadata"`
|
Metadata Metadata `json:"metadata"`
|
||||||
Params Params `json:"params"`
|
Params Params `json:"params"`
|
||||||
Stats Stats `json:"stats"`
|
Stats Stats `json:"stats"`
|
||||||
|
ReportSummary string `json:"reportSummary,omitempty"`
|
||||||
|
TopExternalDomains []DomainCount `json:"topExternalDomains,omitempty"`
|
||||||
|
BrokenSample []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
|
||||||
|
BrokenByDomain []DomainCount `json:"brokenByDomain,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type Metadata struct {
|
type Metadata struct {
|
||||||
|
@ -47,6 +53,11 @@ type Stats struct {
|
||||||
StatusOther int `json:"statusOther"`
|
StatusOther int `json:"statusOther"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DomainCount pairs a domain with the number of links counted against it.
// It is the element type of the per-domain summary lists in Report.
type DomainCount struct {
	// Domain is the host name the count applies to.
	Domain string `json:"domain"`
	// Count is the number of link entries attributed to Domain.
	Count int `json:"count"`
}
|
||||||
|
|
||||||
func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
|
func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
|
||||||
crawledList := sanitizeURLs(keys(crawled))
|
crawledList := sanitizeURLs(keys(crawled))
|
||||||
sitemapList := sanitizeURLs(keys(sitemap))
|
sitemapList := sanitizeURLs(keys(sitemap))
|
||||||
|
@ -92,6 +103,29 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Derived summaries
|
||||||
|
tHost := hostLower(target)
|
||||||
|
extCounts := map[string]int{}
|
||||||
|
brokenByDomain := map[string]int{}
|
||||||
|
var brokenSample []linkcheck.LinkStatus
|
||||||
|
for _, ls := range check.Statuses {
|
||||||
|
h := hostLower(ls.URL)
|
||||||
|
if h != "" && !strings.EqualFold(h, tHost) {
|
||||||
|
extCounts[h]++
|
||||||
|
}
|
||||||
|
if !ls.OK {
|
||||||
|
brokenByDomain[h]++
|
||||||
|
if len(brokenSample) < 10 {
|
||||||
|
brokenSample = append(brokenSample, ls)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
topExternal := mapToSortedSlice(extCounts)
|
||||||
|
brokenBy := mapToSortedSlice(brokenByDomain)
|
||||||
|
|
||||||
|
summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
|
||||||
|
len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
|
||||||
|
|
||||||
return Report{
|
return Report{
|
||||||
Target: target,
|
Target: target,
|
||||||
CrawledURLs: crawledList,
|
CrawledURLs: crawledList,
|
||||||
|
@ -105,6 +139,10 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
|
||||||
Metadata: meta,
|
Metadata: meta,
|
||||||
Params: params,
|
Params: params,
|
||||||
Stats: st,
|
Stats: st,
|
||||||
|
ReportSummary: summary,
|
||||||
|
TopExternalDomains: topExternal,
|
||||||
|
BrokenSample: brokenSample,
|
||||||
|
BrokenByDomain: brokenBy,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -163,3 +201,31 @@ func sanitizeURL(u string) string {
|
||||||
}
|
}
|
||||||
return u
|
return u
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// hostLower returns the lower-cased host name of raw with any port removed,
// so "Example.com:443" and "example.com" map to the same domain key.
// It returns "" when raw cannot be parsed or has no host component
// (for example a relative path or a scheme-less reference).
func hostLower(raw string) string {
	u, err := url.Parse(raw)
	if err != nil {
		return ""
	}
	// Hostname strips the ":port" suffix (and the brackets around IPv6
	// literals); u.Host would keep them and split one domain into several.
	return strings.ToLower(u.Hostname())
}
|
||||||
|
|
||||||
|
func mapToSortedSlice(m map[string]int) []DomainCount {
|
||||||
|
if len(m) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
out := make([]DomainCount, 0, len(m))
|
||||||
|
for k, v := range m {
|
||||||
|
out = append(out, DomainCount{Domain: k, Count: v})
|
||||||
|
}
|
||||||
|
sort.Slice(out, func(i, j int) bool {
|
||||||
|
if out[i].Count == out[j].Count {
|
||||||
|
return out[i].Domain < out[j].Domain
|
||||||
|
}
|
||||||
|
return out[i].Count > out[j].Count
|
||||||
|
})
|
||||||
|
if len(out) > 10 {
|
||||||
|
out = out[:10]
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
|
@ -91,9 +91,15 @@ This document describes the structure of the JSON reports produced by `urlcrawle
|
||||||
- **status5xx** (number)
|
- **status5xx** (number)
|
||||||
- **statusOther** (number)
|
- **statusOther** (number)
|
||||||
|
|
||||||
|
- **reportSummary** (string): Compact summary string like `crawled=7 sitemap=7 links=26 ok=26 broken=0`.
|
||||||
|
- **topExternalDomains** (DomainCount[]): Up to 10 external domains (hosts other than the target's) most often referenced by links, sorted by count (descending), then by domain name.
|
||||||
|
- **brokenSample** (LinkStatus[]): Up to 10 example broken links.
|
||||||
|
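- **brokenByDomain** (DomainCount[]): Broken link counts grouped by domain. Only the 10 domains with the most broken links are listed, sorted by count (descending), then by domain name.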
|
||||||
|
|
||||||
### Notes
|
### Notes
|
||||||
|
|
||||||
- URLs are normalized and deduplicated during crawl. Minor variants like trailing `/.` are normalized in output.
|
- URLs are normalized and deduplicated during crawl. Minor variants like trailing `/.` are normalized in output.
|
||||||
|
- All metrics described here are included by default; no extra flags are required.
|
||||||
- Content-type filtering: only `text/html` pages are parsed for outlinks.
|
- Content-type filtering: only `text/html` pages are parsed for outlinks.
|
||||||
- Sitemap fetching is best-effort; absence is not treated as an error.
|
- Sitemap fetching is best-effort; absence is not treated as an error.
|
||||||
- The JSON lists are sorted to produce stable outputs across runs.
|
- The JSON lists are sorted to produce stable outputs across runs.
|
||||||
|
|
Loading…
Reference in New Issue