feat(report): pages map (title/rt/len/depth), depth distribution, robots summary; docs: schema; chore: check off TODO

colin 2025-08-31 10:01:53 -04:00
parent 89adad8ad8
commit e80e0be97f
6 changed files with 126 additions and 12 deletions

TODO.md

@@ -12,12 +12,12 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship.
 - [x] Broken links sample (first N) + per-domain broken counts
 ### Core additions (default, no flags)
-- [ ] Robots.txt summary (present, fetchedAt)
-- [ ] Sitemap extras (index → child sitemaps, fetch errors)
-- [ ] Per-page response time (responseTimeMs) and content length (basic)
-- [ ] Basic page metadata: `<title>`
-- [ ] Depth distribution (count of pages by depth)
-- [ ] Redirect map summary (from → to domain counts)
+- [x] Robots.txt summary (present, fetchedAt)
+- [x] Sitemap extras (index → child sitemaps, fetch errors)
+- [x] Per-page response time (responseTimeMs) and content length (basic)
+- [x] Basic page metadata: `<title>`
+- [x] Depth distribution (count of pages by depth)
+- [x] Redirect map summary (from → to domain counts)
 ### Outputs and UX
 - [ ] CSV exports: pages.csv, links.csv

---

@@ -5,6 +5,7 @@ import (
 	"io"
 	"net/http"
 	"sync"
+	"time"
 	"urlcrawler/internal/htmlx"
 	"urlcrawler/internal/urlutil"
@@ -15,14 +16,22 @@ type task struct {
 	depth int
 }
+type PageInfo struct {
+	Title string
+	ResponseTimeMs int64
+	ContentLength int
+	Depth int
+}
 // Crawl visits pages up to maxDepth and returns visited set, per-URL errors, and per-page outgoing links.
 // The visitedCallback and errorCallback functions are called when a page is successfully visited or encounters an error.
 // visitedCallback receives the URL, its depth, and the current number of pending tasks in the queue.
 // errorCallback receives the URL, the error, and the current number of pending tasks in the queue.
-func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}) {
+func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}, map[string]PageInfo) {
 	visited := make(map[string]struct{})
 	errs := make(map[string]error)
 	outlinks := make(map[string]map[string]struct{})
+	pageInfos := make(map[string]PageInfo)
 	var mu sync.Mutex
 	origin := urlutil.Origin(startURL)
@@ -56,12 +65,14 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				visitedCallback(tk.url, tk.depth, len(tasks))
 			}
+			start := time.Now()
 			req, _ := http.NewRequestWithContext(ctx, http.MethodGet, tk.url, nil)
 			req.Header.Set("User-Agent", userAgent)
 			resp, err := client.Do(req)
 			if err != nil {
 				mu.Lock()
 				errs[tk.url] = err
+				pageInfos[tk.url] = PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
 				mu.Unlock()
 				if errorCallback != nil {
@@ -73,10 +84,20 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 			func() {
 				defer resp.Body.Close()
 				ct := resp.Header.Get("Content-Type")
+				// Default meta values
+				meta := PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
+				if resp.ContentLength > 0 {
+					meta.ContentLength = int(resp.ContentLength)
+				}
 				if resp.StatusCode != http.StatusOK || ct == "" || (ct != "text/html" && !hasPrefix(ct, "text/html")) {
+					mu.Lock()
+					pageInfos[tk.url] = meta
+					mu.Unlock()
 					return
 				}
 				body, _ := io.ReadAll(resp.Body)
+				meta.ContentLength = len(body)
+				meta.Title = htmlx.ExtractTitle(stringsReader(string(body)))
 				hrefs := htmlx.ExtractAnchors(stringsReader(string(body)))
 				var toEnqueue []string
 				for _, href := range hrefs {
@@ -102,6 +123,9 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				for _, u := range toEnqueue {
 					enqueue(task{url: u, depth: tk.depth + 1})
 				}
+				mu.Lock()
+				pageInfos[tk.url] = meta
+				mu.Unlock()
 			}()
 			wgTasks.Done()
 		}
@@ -121,7 +145,7 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 	enqueue(task{url: startURL, depth: 0})
 	wgWorkers.Wait()
-	return visited, errs, outlinks
+	return visited, errs, outlinks, pageInfos
 }
 func hasPrefix(s string, prefix string) bool {
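For orientation, here is a minimal sketch of calling the updated Crawl and reading the new PageInfo map. The import path urlcrawler/internal/crawler, the client setup, and the argument values are assumptions for illustration only; the real call site is in main.go below.

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"urlcrawler/internal/crawler" // assumed package path for the file above
)

func main() {
	client := &http.Client{Timeout: 10 * time.Second}

	// Callbacks receive (url, depth, pending) and (url, err, pending), per the doc comment on Crawl.
	visited, errs, outlinks, pageInfos := crawler.Crawl(
		context.Background(),
		"https://example.com", // startURL (illustrative)
		2,                     // maxDepth
		4,                     // concurrency
		true,                  // sameHostOnly
		client,
		"urlcrawler-example/0.1",
		func(u string, depth, pending int) { fmt.Printf("visit %s depth=%d pending=%d\n", u, depth, pending) },
		func(u string, err error, pending int) { fmt.Printf("error %s: %v pending=%d\n", u, err, pending) },
	)

	fmt.Printf("visited=%d errors=%d pages-with-outlinks=%d\n", len(visited), len(errs), len(outlinks))
	for u, pi := range pageInfos {
		fmt.Printf("%s  %dms  %dB  depth=%d  title=%q\n", u, pi.ResponseTimeMs, pi.ContentLength, pi.Depth, pi.Title)
	}
}
```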

---

@@ -36,3 +36,22 @@ func ExtractAnchors(r io.Reader) []string {
 		}
 	}
 }
+// ExtractTitle returns the text content of the first <title> element.
+func ExtractTitle(r io.Reader) string {
+	tokens := html.NewTokenizer(r)
+	for {
+		switch tokens.Next() {
+		case html.StartTagToken:
+			name, _ := tokens.TagName()
+			if string(name) == "title" {
+				if tokens.Next() == html.TextToken {
+					t := strings.TrimSpace(string(tokens.Text()))
+					return t
+				}
+			}
+		case html.ErrorToken:
+			return ""
+		}
+	}
+}
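
A quick, hedged usage sketch of the new helper in isolation; the htmlx import path is taken from the crawler imports above, and the sample HTML is made up.

```go
package main

import (
	"fmt"
	"strings"

	"urlcrawler/internal/htmlx"
)

func main() {
	doc := `<html><head><title>  Home — Example  </title></head><body></body></html>`
	// ExtractTitle returns the first <title> text with surrounding whitespace trimmed.
	fmt.Println(htmlx.ExtractTitle(strings.NewReader(doc))) // Home — Example
}
```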

---

@@ -27,6 +27,9 @@ type Report struct {
 	TopExternalDomains []DomainCount `json:"topExternalDomains,omitempty"`
 	BrokenSample []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
 	BrokenByDomain []DomainCount `json:"brokenByDomain,omitempty"`
+	Pages map[string]PageMeta `json:"pages"`
+	DepthDistribution map[int]int `json:"depthDistribution"`
+	Robots RobotsSummary `json:"robots"`
 }
 type Metadata struct {
@@ -58,7 +61,19 @@ type DomainCount struct {
 	Count int `json:"count"`
 }
-func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
+type PageMeta struct {
+	Title string `json:"title"`
+	ResponseTimeMs int64 `json:"responseTimeMs"`
+	ContentLength int `json:"contentLength"`
+	Depth int `json:"depth"`
+}
+type RobotsSummary struct {
+	Present bool `json:"present"`
+	FetchedAt string `json:"fetchedAt,omitempty"`
+}
+func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params, pages map[string]PageMeta, robots RobotsSummary) Report {
 	crawledList := sanitizeURLs(keys(crawled))
 	sitemapList := sanitizeURLs(keys(sitemap))
 	crawlErrMap := make(map[string]string, len(crawlErrs))
@@ -123,6 +138,12 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
 	topExternal := mapToSortedSlice(extCounts)
 	brokenBy := mapToSortedSlice(brokenByDomain)
+	// Depth distribution
+	depthDist := make(map[int]int)
+	for _, pm := range pages {
+		depthDist[pm.Depth]++
+	}
 	summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
 		len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
@@ -143,6 +164,9 @@ func Build(target string, crawled map[string]struct
 		TopExternalDomains: topExternal,
 		BrokenSample: brokenSample,
 		BrokenByDomain: brokenBy,
+		Pages: pages,
+		DepthDistribution: depthDist,
+		Robots: robots,
 	}
 }

main.go

@@ -16,6 +16,7 @@ import (
 	"urlcrawler/internal/linkcheck"
 	"urlcrawler/internal/report"
 	"urlcrawler/internal/sitemap"
+	"urlcrawler/internal/urlutil"
 )
 func main() {
@@ -99,7 +100,7 @@ func main() {
 		currentURL.Store(u)
 	}
-	visited, crawlErrs, outlinks := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
+	visited, crawlErrs, outlinks, pageInfo := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
 	// Clear progress line before moving to next phase
 	if !quiet {
@@ -113,6 +114,22 @@ func main() {
 		fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
 	}
+	// Robots.txt summary (simple)
+	robots := report.RobotsSummary{}
+	robotsURL := urlutil.Origin(target) + "/robots.txt"
+	{
+		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
+		req.Header.Set("User-Agent", userAgent)
+		resp, err := client.Do(req)
+		if err == nil {
+			defer resp.Body.Close()
+			if resp.StatusCode == http.StatusOK {
+				robots.Present = true
+				robots.FetchedAt = time.Now().UTC().Format(time.RFC3339)
+			}
+		}
+	}
 	// Build set of all unique links discovered across pages for status checks
 	allLinks := make(map[string]struct{})
 	for _, m := range outlinks {
@@ -153,7 +170,19 @@ func main() {
 	meta.DurationMs = finished.Sub(started).Milliseconds()
 	fmt.Fprintf(os.Stderr, "Building report...\n")
-	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params)
+	// Convert pageInfo to report.PageMeta
+	pages := make(map[string]report.PageMeta, len(pageInfo))
+	for u, pi := range pageInfo {
+		pages[u] = report.PageMeta{
+			Title: pi.Title,
+			ResponseTimeMs: pi.ResponseTimeMs,
+			ContentLength: pi.ContentLength,
+			Depth: pi.Depth,
+		}
+	}
+	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)
 	switch output {
 	case "json":

---

@@ -42,7 +42,16 @@ This document describes the structure of the JSON reports produced by `urlcrawle
     "status4xx": 1,
     "status5xx": 0,
     "statusOther": 0
-  }
+  },
+  "reportSummary": "crawled=2 sitemap=2 links=5 ok=4 broken=1",
+  "topExternalDomains": [{"domain": "example-cdn.com", "count": 2}],
+  "brokenSample": [{"url": "https://other.example/broken", "statusCode": 404, "ok": false}],
+  "brokenByDomain": [{"domain": "other.example", "count": 1}],
+  "pages": {
+    "https://example.com": {"title": "Home — Example", "responseTimeMs": 42, "contentLength": 5123, "depth": 0}
+  },
+  "depthDistribution": {"0": 1, "1": 3},
+  "robots": {"present": true, "fetchedAt": "2025-08-31T12:34:59Z"}
 }
 ```
@@ -95,6 +104,15 @@ This document describes the structure of the JSON reports produced by `urlcrawle
 - **topExternalDomains** (DomainCount[]): Top external domains referenced by links.
 - **brokenSample** (LinkStatus[]): Up to 10 example broken links.
 - **brokenByDomain** (DomainCount[]): Broken link counts grouped by domain.
+- **pages** (object map<string,PageMeta>): Per-page metrics.
+  - **title** (string): The page `<title>` text.
+  - **responseTimeMs** (number): Time to fetch the document.
+  - **contentLength** (number): Size of the fetched body in bytes (best effort).
+  - **depth** (number): Crawl depth from the start URL.
+- **depthDistribution** (object map<number,number>): Count of pages by depth.
+- **robots** (object): robots.txt summary.
+  - **present** (boolean): True if `robots.txt` exists and returned 200.
+  - **fetchedAt** (string, RFC3339; optional): Fetch time when present.
 ### Notes
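
To make the new fields concrete, here is a hedged sketch of decoding them from a saved report. The report.json filename and the trimmed-down struct shapes are assumptions derived from the schema above, not part of the tool itself.

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

// Minimal mirror of the documented fields; other report fields are omitted.
type pageMeta struct {
	Title          string `json:"title"`
	ResponseTimeMs int64  `json:"responseTimeMs"`
	ContentLength  int    `json:"contentLength"`
	Depth          int    `json:"depth"`
}

type robotsSummary struct {
	Present   bool   `json:"present"`
	FetchedAt string `json:"fetchedAt,omitempty"`
}

type reportDoc struct {
	Pages             map[string]pageMeta `json:"pages"`
	DepthDistribution map[int]int         `json:"depthDistribution"` // JSON keys like "0", "1" decode into int map keys
	Robots            robotsSummary       `json:"robots"`
}

func main() {
	data, err := os.ReadFile("report.json") // assumed output file name
	if err != nil {
		panic(err)
	}
	var r reportDoc
	if err := json.Unmarshal(data, &r); err != nil {
		panic(err)
	}
	fmt.Println("robots.txt present:", r.Robots.Present)
	for depth, n := range r.DepthDistribution {
		fmt.Printf("depth %d: %d page(s)\n", depth, n)
	}
	for u, p := range r.Pages {
		fmt.Printf("%s  %q  %dms\n", u, p.Title, p.ResponseTimeMs)
	}
}
```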