diff --git a/TODO.md b/TODO.md
index c1671d8..b663944 100644
--- a/TODO.md
+++ b/TODO.md
@@ -7,39 +7,24 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship.
 - [x] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly)
 - [x] Status histogram (2xx/3xx/4xx/5xx totals) in summary
 - [x] Normalize and dedupe trailing `/.` URL variants in output
-- [ ] Add compact `reportSummary` text block to JSON
-- [ ] Top external domains with counts
-- [ ] Broken links sample (first N) + per-domain broken counts
+- [x] Add compact `reportSummary` text block to JSON
+- [x] Top external domains with counts
+- [x] Broken links sample (first N) + per-domain broken counts
 
-### Moderate scope
-- [ ] Robots.txt summary (present, fetchedAt, sample disallow rules)
+### Core additions (default, no flags)
+- [ ] Robots.txt summary (present, fetchedAt)
 - [ ] Sitemap extras (index → child sitemaps, fetch errors)
-- [ ] Per-page response time (responseTimeMs) and content length
-- [ ] Basic page metadata: `<title>`, canonical (if present)
+- [ ] Per-page response time (responseTimeMs) and content length (basic)
+- [ ] Basic page metadata: `<title>`
 - [ ] Depth distribution (count of pages by depth)
-- [ ] Duplicate title/canonical detection (lists of URLs)
-
-### Content/asset analysis
-- [ ] Extract assets (images/css/js) per page with status/type/size
-- [ ] Mixed-content detection (http assets on https pages)
-- [ ] Image accessibility metric (alt present ratio)
-
-### Security and quality signals
-- [ ] Security headers by host (HSTS, CSP, X-Frame-Options, Referrer-Policy)
-- [ ] Insecure forms (http action on https page)
-- [ ] Large pages and slow pages (p95 thresholds) summary
-
-### Link behavior and graph
-- [ ] Redirect map (from → to, hops; count summary)
-- [ ] Indegree/outdegree stats; small graph summary
+- [ ] Redirect map summary (from → to domain counts)
 
 ### Outputs and UX
-- [ ] CSV exports: pages.csv, links.csv, assets.csv
+- [ ] CSV exports: pages.csv, links.csv
 - [ ] NDJSON export option for streaming pipelines
-- [ ] Optional: include file/line anchors in JSON for large outputs
 
 ### Notes
-- Keep JSON stable and sorted; avoid breaking changes. If we change fields, bump minor version and document in `reports/REPORT_SCHEMA.md`.
-- Favor opt-in flags for heavier analyses (assets, headers) to keep default runs fast.
+- All report metrics must be gathered by default with zero flags required.
+- Keep JSON stable and sorted; update `reports/REPORT_SCHEMA.md` when fields change.
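Review note: the still-unchecked "NDJSON export option for streaming pipelines" item is easy to picture with a sketch. The record shape below (`pageRecord` and its fields) is an assumption for illustration only; nothing in this PR defines it. The point is that `encoding/json`'s `Encoder` already writes one object per line, which is exactly NDJSON:

```go
// A hedged sketch of the planned NDJSON export: one JSON object per crawled
// URL, one per line, so output can be piped into jq or a streaming loader.
// pageRecord and its fields are hypothetical, not part of this PR.
package main

import (
	"bufio"
	"encoding/json"
	"os"
)

type pageRecord struct {
	URL    string `json:"url"`
	Status int    `json:"status"`
	Depth  int    `json:"depth"`
}

func main() {
	w := bufio.NewWriter(os.Stdout)
	defer w.Flush()
	enc := json.NewEncoder(w) // Encode appends a trailing newline: ready-made NDJSON
	for _, p := range []pageRecord{
		{URL: "https://example.com/", Status: 200, Depth: 0},
		{URL: "https://example.com/about", Status: 200, Depth: 1},
	} {
		if err := enc.Encode(p); err != nil {
			panic(err)
		}
	}
}
```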
diff --git a/internal/report/report.go b/internal/report/report.go
index 8349cda..cbf1dd7 100644
--- a/internal/report/report.go
+++ b/internal/report/report.go
@@ -3,7 +3,9 @@ package report
 import (
 	"fmt"
 	"io"
+	"net/url"
 	"sort"
+	"strings"
 
 	"urlcrawler/internal/linkcheck"
 )
@@ -21,6 +23,10 @@ type Report struct {
 	Metadata Metadata `json:"metadata"`
 	Params   Params   `json:"params"`
 	Stats    Stats    `json:"stats"`
+	ReportSummary      string                 `json:"reportSummary,omitempty"`
+	TopExternalDomains []DomainCount          `json:"topExternalDomains,omitempty"`
+	BrokenSample       []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
+	BrokenByDomain     []DomainCount          `json:"brokenByDomain,omitempty"`
 }
 
 type Metadata struct {
@@ -47,6 +53,11 @@ type Stats struct {
 	StatusOther int `json:"statusOther"`
 }
 
+type DomainCount struct {
+	Domain string `json:"domain"`
+	Count  int    `json:"count"`
+}
+
 func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
 	crawledList := sanitizeURLs(keys(crawled))
 	sitemapList := sanitizeURLs(keys(sitemap))
@@ -92,6 +103,29 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struc
 		}
 	}
 
+	// Derived summaries
+	tHost := hostLower(target)
+	extCounts := map[string]int{}
+	brokenByDomain := map[string]int{}
+	var brokenSample []linkcheck.LinkStatus
+	for _, ls := range check.Statuses {
+		h := hostLower(ls.URL)
+		if h != "" && !strings.EqualFold(h, tHost) {
+			extCounts[h]++
+		}
+		if !ls.OK {
+			brokenByDomain[h]++
+			if len(brokenSample) < 10 {
+				brokenSample = append(brokenSample, ls)
+			}
+		}
+	}
+	topExternal := mapToSortedSlice(extCounts)
+	brokenBy := mapToSortedSlice(brokenByDomain)
+
+	summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
+		len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
+
 	return Report{
 		Target:      target,
 		CrawledURLs: crawledList,
@@ -105,6 +139,10 @@
 		Metadata: meta,
 		Params:   params,
 		Stats:    st,
+		ReportSummary:      summary,
+		TopExternalDomains: topExternal,
+		BrokenSample:       brokenSample,
+		BrokenByDomain:     brokenBy,
 	}
 }
@@ -163,3 +201,31 @@ func sanitizeURL(u string) string {
 	}
 	return u
 }
+
+func hostLower(raw string) string {
+	u, err := url.Parse(raw)
+	if err != nil {
+		return ""
+	}
+	return strings.ToLower(u.Host)
+}
+
+func mapToSortedSlice(m map[string]int) []DomainCount {
+	if len(m) == 0 {
+		return nil
+	}
+	out := make([]DomainCount, 0, len(m))
+	for k, v := range m {
+		out = append(out, DomainCount{Domain: k, Count: v})
+	}
+	sort.Slice(out, func(i, j int) bool {
+		if out[i].Count == out[j].Count {
+			return out[i].Domain < out[j].Domain
+		}
+		return out[i].Count > out[j].Count
+	})
+	if len(out) > 10 {
+		out = out[:10]
+	}
+	return out
+}
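Review note: the tie-breaking in `mapToSortedSlice` (count descending, then domain ascending, capped at ten) is what keeps `topExternalDomains` and `brokenByDomain` deterministic across runs despite Go's randomized map iteration. A minimal standalone harness, with the function and type copied from the diff above and a small `main` added purely for illustration:

```go
// Standalone harness for the sorting/truncation behavior added in this PR.
// DomainCount and mapToSortedSlice mirror the diff; main is illustrative.
package main

import (
	"fmt"
	"sort"
)

type DomainCount struct {
	Domain string `json:"domain"`
	Count  int    `json:"count"`
}

func mapToSortedSlice(m map[string]int) []DomainCount {
	if len(m) == 0 {
		return nil
	}
	out := make([]DomainCount, 0, len(m))
	for k, v := range m {
		out = append(out, DomainCount{Domain: k, Count: v})
	}
	sort.Slice(out, func(i, j int) bool {
		if out[i].Count == out[j].Count {
			return out[i].Domain < out[j].Domain // ties: alphabetical, so output is stable
		}
		return out[i].Count > out[j].Count // otherwise: highest count first
	})
	if len(out) > 10 {
		out = out[:10] // cap at the top ten entries
	}
	return out
}

func main() {
	counts := map[string]int{"b.example": 2, "a.example": 2, "cdn.example": 5}
	fmt.Println(mapToSortedSlice(counts))
	// [{cdn.example 5} {a.example 2} {b.example 2}]
}
```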
diff --git a/reports/REPORT_SCHEMA.md b/reports/REPORT_SCHEMA.md
index 65d5bc5..0f3f91f 100644
--- a/reports/REPORT_SCHEMA.md
+++ b/reports/REPORT_SCHEMA.md
@@ -91,9 +91,15 @@ This document describes the structure of the JSON reports produced by `urlcrawler`.
 - **status5xx** (number)
 - **statusOther** (number)
+- **reportSummary** (string): Compact summary string like `crawled=7 sitemap=7 links=26 ok=26 broken=0`.
+- **topExternalDomains** (DomainCount[]): Top external domains referenced by links.
+- **brokenSample** (LinkStatus[]): Up to 10 example broken links.
+- **brokenByDomain** (DomainCount[]): Broken link counts grouped by domain.
+
 ### Notes
 - URLs are normalized and deduplicated during crawl. Minor variants like trailing `/.` are normalized in output.
+- All metrics described here are included by default; no extra flags are required.
 - Content-type filtering: only `text/html` pages are parsed for outlinks.
 - Sitemap fetching is best-effort; absence is not treated as an error.
 - The JSON lists are sorted to produce stable outputs across runs.
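Finally, a minimal sketch of a downstream consumer of the new fields, assuming a report file at a hypothetical path `reports/example.json`; the struct mirrors only the fields documented above, and the lowercase type names are local to this snippet:

```go
// Hedged consumer sketch (not part of this PR): reads a saved report and
// prints the new summary fields. The file path is a placeholder assumption.
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

type domainCount struct {
	Domain string `json:"domain"`
	Count  int    `json:"count"`
}

type report struct {
	ReportSummary      string        `json:"reportSummary"`
	TopExternalDomains []domainCount `json:"topExternalDomains"`
	BrokenByDomain     []domainCount `json:"brokenByDomain"`
}

func main() {
	data, err := os.ReadFile("reports/example.json") // hypothetical path
	if err != nil {
		panic(err)
	}
	var r report
	if err := json.Unmarshal(data, &r); err != nil {
		panic(err)
	}
	fmt.Println(r.ReportSummary) // e.g. "crawled=7 sitemap=7 links=26 ok=26 broken=0"
	for _, d := range r.TopExternalDomains {
		fmt.Printf("%s: %d\n", d.Domain, d.Count)
	}
}
```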