diff --git a/TODO.md b/TODO.md
index c1671d8..b663944 100644
--- a/TODO.md
+++ b/TODO.md
@@ -7,39 +7,24 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship.
- [x] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly)
- [x] Status histogram (2xx/3xx/4xx/5xx totals) in summary
- [x] Normalize and dedupe trailing `/.` URL variants in output
-- [ ] Add compact `reportSummary` text block to JSON
-- [ ] Top external domains with counts
-- [ ] Broken links sample (first N) + per-domain broken counts
+- [x] Add compact `reportSummary` text block to JSON
+- [x] Top external domains with counts
+- [x] Broken links sample (first N) + per-domain broken counts

-### Moderate scope
-- [ ] Robots.txt summary (present, fetchedAt, sample disallow rules)
+### Core additions (default, no flags)
+- [ ] Robots.txt summary (present, fetchedAt)
- [ ] Sitemap extras (index → child sitemaps, fetch errors)
-- [ ] Per-page response time (responseTimeMs) and content length
-- [ ] Basic page metadata: `<title>`, canonical (if present)
+- [ ] Per-page response time (responseTimeMs) and content length (basic)
+- [ ] Basic page metadata: `<title>`
- [ ] Depth distribution (count of pages by depth)
-- [ ] Duplicate title/canonical detection (lists of URLs)
-
-### Content/asset analysis
-- [ ] Extract assets (images/css/js) per page with status/type/size
-- [ ] Mixed-content detection (http assets on https pages)
-- [ ] Image accessibility metric (alt present ratio)
-
-### Security and quality signals
-- [ ] Security headers by host (HSTS, CSP, X-Frame-Options, Referrer-Policy)
-- [ ] Insecure forms (http action on https page)
-- [ ] Large pages and slow pages (p95 thresholds) summary
-
-### Link behavior and graph
-- [ ] Redirect map (from → to, hops; count summary)
-- [ ] Indegree/outdegree stats; small graph summary
+- [ ] Redirect map summary (from → to domain counts)

### Outputs and UX
-- [ ] CSV exports: pages.csv, links.csv, assets.csv
+- [ ] CSV exports: pages.csv, links.csv
- [ ] NDJSON export option for streaming pipelines
-- [ ] Optional: include file/line anchors in JSON for large outputs

### Notes
-- Keep JSON stable and sorted; avoid breaking changes. If we change fields, bump minor version and document in `reports/REPORT_SCHEMA.md`.
-- Favor opt-in flags for heavier analyses (assets, headers) to keep default runs fast.
+- All report metrics must be gathered by default with zero flags required.
+- Keep JSON stable and sorted; update `reports/REPORT_SCHEMA.md` when fields change.
diff --git a/internal/report/report.go b/internal/report/report.go
index 8349cda..cbf1dd7 100644
--- a/internal/report/report.go
+++ b/internal/report/report.go
@@ -3,7 +3,9 @@ package report
import (
"fmt"
"io"
+ "net/url"
"sort"
+ "strings"
"urlcrawler/internal/linkcheck"
)
@@ -21,6 +23,10 @@ type Report struct {
Metadata Metadata `json:"metadata"`
Params Params `json:"params"`
Stats Stats `json:"stats"`
+ ReportSummary string `json:"reportSummary,omitempty"`
+ TopExternalDomains []DomainCount `json:"topExternalDomains,omitempty"`
+ BrokenSample []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
+ BrokenByDomain []DomainCount `json:"brokenByDomain,omitempty"`
}

type Metadata struct {
@@ -47,6 +53,11 @@ type Stats struct {
StatusOther int `json:"statusOther"`
}
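+// DomainCount pairs a domain with how many times it was seen; used for the
+// external-domain and broken-by-domain summaries.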
+type DomainCount struct {
+ Domain string `json:"domain"`
+ Count int `json:"count"`
+}
+
func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
crawledList := sanitizeURLs(keys(crawled))
sitemapList := sanitizeURLs(keys(sitemap))
@@ -92,6 +103,29 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
}
}
+ // Derived summaries
+ tHost := hostLower(target)
+ extCounts := map[string]int{}
+ brokenByDomain := map[string]int{}
+ var brokenSample []linkcheck.LinkStatus
+ for _, ls := range check.Statuses {
+ h := hostLower(ls.URL)
+ if h != "" && !strings.EqualFold(h, tHost) {
+ extCounts[h]++
+ }
+ if !ls.OK {
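+ // Links with unparseable hosts are grouped under the empty-string domain.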
+ brokenByDomain[h]++
+ if len(brokenSample) < 10 {
+ brokenSample = append(brokenSample, ls)
+ }
+ }
+ }
+ topExternal := mapToSortedSlice(extCounts)
+ brokenBy := mapToSortedSlice(brokenByDomain)
+
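+ // One-line summary; format matches the example documented in reports/REPORT_SCHEMA.md.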
+ summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
+ len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
+
return Report{
Target: target,
CrawledURLs: crawledList,
@@ -105,6 +139,10 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
Metadata: meta,
Params: params,
Stats: st,
+ ReportSummary: summary,
+ TopExternalDomains: topExternal,
+ BrokenSample: brokenSample,
+ BrokenByDomain: brokenBy,
}
}
@@ -163,3 +201,31 @@ func sanitizeURL(u string) string {
}
return u
}
+
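+// hostLower returns the lowercased host (including any port) of raw, or ""
+// when raw has no host or fails to parse.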
+func hostLower(raw string) string {
+ u, err := url.Parse(raw)
+ if err != nil {
+ return ""
+ }
+ return strings.ToLower(u.Host)
+}
+
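+// mapToSortedSlice flattens a domain→count map into a slice sorted by
+// descending count (ties broken alphabetically), truncated to the top 10.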
+func mapToSortedSlice(m map[string]int) []DomainCount {
+ if len(m) == 0 {
+ return nil
+ }
+ out := make([]DomainCount, 0, len(m))
+ for k, v := range m {
+ out = append(out, DomainCount{Domain: k, Count: v})
+ }
+ sort.Slice(out, func(i, j int) bool {
+ if out[i].Count == out[j].Count {
+ return out[i].Domain < out[j].Domain
+ }
+ return out[i].Count > out[j].Count
+ })
+ if len(out) > 10 {
+ out = out[:10]
+ }
+ return out
+}
diff --git a/reports/REPORT_SCHEMA.md b/reports/REPORT_SCHEMA.md
index 65d5bc5..0f3f91f 100644
--- a/reports/REPORT_SCHEMA.md
+++ b/reports/REPORT_SCHEMA.md
@@ -91,9 +91,15 @@ This document describes the structure of the JSON reports produced by `urlcrawle
- **status5xx** (number)
- **statusOther** (number)
+- **reportSummary** (string): Compact summary string like `crawled=7 sitemap=7 links=26 ok=26 broken=0`.
+- **topExternalDomains** (DomainCount[]): Top external domains referenced by checked links, sorted by count (max 10).
+- **brokenSample** (LinkStatus[]): Up to 10 example broken links.
+- **brokenByDomain** (DomainCount[]): Broken-link counts grouped by domain, sorted by count (max 10).
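+
+Example fragment (illustrative placeholder values; the `brokenSample` field names below are an assumption based on how `linkcheck.LinkStatus` is used, not a confirmed schema):
+
+```json
+{
+  "reportSummary": "crawled=7 sitemap=7 links=26 ok=25 broken=1",
+  "topExternalDomains": [
+    { "domain": "cdn.example.com", "count": 4 },
+    { "domain": "fonts.example.net", "count": 2 }
+  ],
+  "brokenSample": [
+    { "url": "https://example.com/missing", "ok": false }
+  ],
+  "brokenByDomain": [
+    { "domain": "example.com", "count": 1 }
+  ]
+}
+```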
+
### Notes
- URLs are normalized and deduplicated during crawl. Minor variants like trailing `/.` are normalized in output.
+- All metrics described here are included by default; no extra flags are required.
- Content-type filtering: only `text/html` pages are parsed for outlinks.
- Sitemap fetching is best-effort; absence is not treated as an error.
- The JSON lists are sorted to produce stable outputs across runs.