feat(report): reportSummary, topExternalDomains, broken samples/domain counts; docs: schema; chore: prune TODO scope
This commit is contained in:
		
							parent
							
								
									3eb9ab48bf
								
							
						
					
					
						commit
						89adad8ad8
					
				
							
								
								
									
										37
									
								
								TODO.md
								
								
								
								
							
							
						
						
									
										37
									
								
								TODO.md
								
								
								
								
							|  | @ -7,39 +7,24 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship. | ||||||
| - [x] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly) | - [x] Include run parameters in report (maxDepth, concurrency, timeout, userAgent, sameHostOnly) | ||||||
| - [x] Status histogram (2xx/3xx/4xx/5xx totals) in summary | - [x] Status histogram (2xx/3xx/4xx/5xx totals) in summary | ||||||
| - [x] Normalize and dedupe trailing `/.` URL variants in output | - [x] Normalize and dedupe trailing `/.` URL variants in output | ||||||
| - [ ] Add compact `reportSummary` text block to JSON | - [x] Add compact `reportSummary` text block to JSON | ||||||
| - [ ] Top external domains with counts | - [x] Top external domains with counts | ||||||
| - [ ] Broken links sample (first N) + per-domain broken counts | - [x] Broken links sample (first N) + per-domain broken counts | ||||||
| 
 | 
 | ||||||
| ### Moderate scope | ### Core additions (default, no flags) | ||||||
| - [ ] Robots.txt summary (present, fetchedAt, sample disallow rules) | - [ ] Robots.txt summary (present, fetchedAt) | ||||||
| - [ ] Sitemap extras (index → child sitemaps, fetch errors) | - [ ] Sitemap extras (index → child sitemaps, fetch errors) | ||||||
| - [ ] Per-page response time (responseTimeMs) and content length | - [ ] Per-page response time (responseTimeMs) and content length (basic) | ||||||
| - [ ] Basic page metadata: `<title>`, canonical (if present) | - [ ] Basic page metadata: `<title>` | ||||||
| - [ ] Depth distribution (count of pages by depth) | - [ ] Depth distribution (count of pages by depth) | ||||||
| - [ ] Duplicate title/canonical detection (lists of URLs) | - [ ] Redirect map summary (from → to domain counts) | ||||||
| 
 |  | ||||||
| ### Content/asset analysis |  | ||||||
| - [ ] Extract assets (images/css/js) per page with status/type/size |  | ||||||
| - [ ] Mixed-content detection (http assets on https pages) |  | ||||||
| - [ ] Image accessibility metric (alt present ratio) |  | ||||||
| 
 |  | ||||||
| ### Security and quality signals |  | ||||||
| - [ ] Security headers by host (HSTS, CSP, X-Frame-Options, Referrer-Policy) |  | ||||||
| - [ ] Insecure forms (http action on https page) |  | ||||||
| - [ ] Large pages and slow pages (p95 thresholds) summary |  | ||||||
| 
 |  | ||||||
| ### Link behavior and graph |  | ||||||
| - [ ] Redirect map (from → to, hops; count summary) |  | ||||||
| - [ ] Indegree/outdegree stats; small graph summary |  | ||||||
| 
 | 
 | ||||||
| ### Outputs and UX | ### Outputs and UX | ||||||
| - [ ] CSV exports: pages.csv, links.csv, assets.csv | - [ ] CSV exports: pages.csv, links.csv | ||||||
| - [ ] NDJSON export option for streaming pipelines | - [ ] NDJSON export option for streaming pipelines | ||||||
| - [ ] Optional: include file/line anchors in JSON for large outputs |  | ||||||
| 
 | 
 | ||||||
| ### Notes | ### Notes | ||||||
| - Keep JSON stable and sorted; avoid breaking changes. If we change fields, bump minor version and document in `reports/REPORT_SCHEMA.md`. | - All report metrics must be gathered by default with zero flags required. | ||||||
| - Favor opt-in flags for heavier analyses (assets, headers) to keep default runs fast. | - Keep JSON stable and sorted; update `reports/REPORT_SCHEMA.md` when fields change. | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -3,7 +3,9 @@ package report | ||||||
| import ( | import ( | ||||||
| 	"fmt" | 	"fmt" | ||||||
| 	"io" | 	"io" | ||||||
|  | 	"net/url" | ||||||
| 	"sort" | 	"sort" | ||||||
|  | 	"strings" | ||||||
| 
 | 
 | ||||||
| 	"urlcrawler/internal/linkcheck" | 	"urlcrawler/internal/linkcheck" | ||||||
| ) | ) | ||||||
|  | @ -21,6 +23,10 @@ type Report struct { | ||||||
| 	Metadata            Metadata               `json:"metadata"` | 	Metadata            Metadata               `json:"metadata"` | ||||||
| 	Params              Params                 `json:"params"` | 	Params              Params                 `json:"params"` | ||||||
| 	Stats               Stats                  `json:"stats"` | 	Stats               Stats                  `json:"stats"` | ||||||
|  | 	ReportSummary       string                 `json:"reportSummary,omitempty"` | ||||||
|  | 	TopExternalDomains  []DomainCount          `json:"topExternalDomains,omitempty"` | ||||||
|  | 	BrokenSample        []linkcheck.LinkStatus `json:"brokenSample,omitempty"` | ||||||
|  | 	BrokenByDomain      []DomainCount          `json:"brokenByDomain,omitempty"` | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| type Metadata struct { | type Metadata struct { | ||||||
|  | @ -47,6 +53,11 @@ type Stats struct { | ||||||
| 	StatusOther int `json:"statusOther"` | 	StatusOther int `json:"statusOther"` | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
// DomainCount pairs a host name with the number of links attributed to it.
// It is the element type of the per-domain lists in the JSON report.
type DomainCount struct {
	// Domain is the host the count refers to.
	Domain string `json:"domain"`
	// Count is the number of links recorded for Domain.
	Count int `json:"count"`
}
|  | 
 | ||||||
| func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report { | func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report { | ||||||
| 	crawledList := sanitizeURLs(keys(crawled)) | 	crawledList := sanitizeURLs(keys(crawled)) | ||||||
| 	sitemapList := sanitizeURLs(keys(sitemap)) | 	sitemapList := sanitizeURLs(keys(sitemap)) | ||||||
|  | @ -92,6 +103,29 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	// Derived summaries
 | ||||||
|  | 	tHost := hostLower(target) | ||||||
|  | 	extCounts := map[string]int{} | ||||||
|  | 	brokenByDomain := map[string]int{} | ||||||
|  | 	var brokenSample []linkcheck.LinkStatus | ||||||
|  | 	for _, ls := range check.Statuses { | ||||||
|  | 		h := hostLower(ls.URL) | ||||||
|  | 		if h != "" && !strings.EqualFold(h, tHost) { | ||||||
|  | 			extCounts[h]++ | ||||||
|  | 		} | ||||||
|  | 		if !ls.OK { | ||||||
|  | 			brokenByDomain[h]++ | ||||||
|  | 			if len(brokenSample) < 10 { | ||||||
|  | 				brokenSample = append(brokenSample, ls) | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	topExternal := mapToSortedSlice(extCounts) | ||||||
|  | 	brokenBy := mapToSortedSlice(brokenByDomain) | ||||||
|  | 
 | ||||||
|  | 	summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d", | ||||||
|  | 		len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken) | ||||||
|  | 
 | ||||||
| 	return Report{ | 	return Report{ | ||||||
| 		Target:              target, | 		Target:              target, | ||||||
| 		CrawledURLs:         crawledList, | 		CrawledURLs:         crawledList, | ||||||
|  | @ -105,6 +139,10 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct | ||||||
| 		Metadata:            meta, | 		Metadata:            meta, | ||||||
| 		Params:              params, | 		Params:              params, | ||||||
| 		Stats:               st, | 		Stats:               st, | ||||||
|  | 		ReportSummary:       summary, | ||||||
|  | 		TopExternalDomains:  topExternal, | ||||||
|  | 		BrokenSample:        brokenSample, | ||||||
|  | 		BrokenByDomain:      brokenBy, | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -163,3 +201,31 @@ func sanitizeURL(u string) string { | ||||||
| 	} | 	} | ||||||
| 	return u | 	return u | ||||||
| } | } | ||||||
|  | 
 | ||||||
// hostLower returns the lower-cased host component of raw, for use as a
// domain-grouping key. It returns "" when raw cannot be parsed or carries no
// host (for example relative references or opaque URLs such as mailto:).
//
// A port that merely restates the scheme default (":80" for http, ":443" for
// https) is dropped, so "https://example.com:443/" and "https://example.com/"
// yield the same key. Any other explicit port is kept.
func hostLower(raw string) string {
	u, err := url.Parse(raw)
	if err != nil {
		return ""
	}
	host := strings.ToLower(u.Host)
	port := u.Port()
	// url.Parse lower-cases the scheme, so a direct comparison is sufficient.
	isDefaultPort := (u.Scheme == "http" && port == "80") ||
		(u.Scheme == "https" && port == "443")
	if isDefaultPort {
		host = strings.TrimSuffix(host, ":"+port)
	}
	return host
}
|  | 
 | ||||||
|  | func mapToSortedSlice(m map[string]int) []DomainCount { | ||||||
|  | 	if len(m) == 0 { | ||||||
|  | 		return nil | ||||||
|  | 	} | ||||||
|  | 	out := make([]DomainCount, 0, len(m)) | ||||||
|  | 	for k, v := range m { | ||||||
|  | 		out = append(out, DomainCount{Domain: k, Count: v}) | ||||||
|  | 	} | ||||||
|  | 	sort.Slice(out, func(i, j int) bool { | ||||||
|  | 		if out[i].Count == out[j].Count { | ||||||
|  | 			return out[i].Domain < out[j].Domain | ||||||
|  | 		} | ||||||
|  | 		return out[i].Count > out[j].Count | ||||||
|  | 	}) | ||||||
|  | 	if len(out) > 10 { | ||||||
|  | 		out = out[:10] | ||||||
|  | 	} | ||||||
|  | 	return out | ||||||
|  | } | ||||||
|  |  | ||||||
|  | @ -91,9 +91,15 @@ This document describes the structure of the JSON reports produced by `urlcrawle | ||||||
|   - **status5xx** (number) |   - **status5xx** (number) | ||||||
|   - **statusOther** (number) |   - **statusOther** (number) | ||||||
| 
 | 
 | ||||||
|  | - **reportSummary** (string): Compact summary string like `crawled=7 sitemap=7 links=26 ok=26 broken=0`. | ||||||
|  | - **topExternalDomains** (DomainCount[]): Up to 10 external domains (hosts other than the target's) referenced by checked links, sorted by count descending, then by domain name. | ||||||
|  | - **brokenSample** (LinkStatus[]): Up to 10 example broken links. | ||||||
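|  | - **brokenByDomain** (DomainCount[]): Broken link counts grouped by domain, limited to the 10 domains with the most broken links (sorted by count descending, then by domain name). | ||||||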
|  | 
 | ||||||
| ### Notes | ### Notes | ||||||
| 
 | 
 | ||||||
| - URLs are normalized and deduplicated during crawl. Minor variants like trailing `/.` are normalized in output. | - URLs are normalized and deduplicated during crawl. Minor variants like trailing `/.` are normalized in output. | ||||||
|  | - All metrics described here are included by default; no extra flags are required. | ||||||
| - Content-type filtering: only `text/html` pages are parsed for outlinks. | - Content-type filtering: only `text/html` pages are parsed for outlinks. | ||||||
| - Sitemap fetching is best-effort; absence is not treated as an error. | - Sitemap fetching is best-effort; absence is not treated as an error. | ||||||
| - The JSON lists are sorted to produce stable outputs across runs. | - The JSON lists are sorted to produce stable outputs across runs. | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue