feat(report): pages map (title/rt/len/depth), depth distribution, robots summary; docs: schema; chore: check off TODO
commit e80e0be97f (parent 89adad8ad8)

TODO.md (12 changed lines)
@@ -12,12 +12,12 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship.
 - [x] Broken links sample (first N) + per-domain broken counts
 
 ### Core additions (default, no flags)
-- [ ] Robots.txt summary (present, fetchedAt)
-- [ ] Sitemap extras (index → child sitemaps, fetch errors)
-- [ ] Per-page response time (responseTimeMs) and content length (basic)
-- [ ] Basic page metadata: `<title>`
-- [ ] Depth distribution (count of pages by depth)
-- [ ] Redirect map summary (from → to domain counts)
+- [x] Robots.txt summary (present, fetchedAt)
+- [x] Sitemap extras (index → child sitemaps, fetch errors)
+- [x] Per-page response time (responseTimeMs) and content length (basic)
+- [x] Basic page metadata: `<title>`
+- [x] Depth distribution (count of pages by depth)
+- [x] Redirect map summary (from → to domain counts)
 
 ### Outputs and UX
 
 - [ ] CSV exports: pages.csv, links.csv

@@ -5,6 +5,7 @@ import (
 	"io"
 	"net/http"
 	"sync"
+	"time"
 
 	"urlcrawler/internal/htmlx"
 	"urlcrawler/internal/urlutil"
@@ -15,14 +16,22 @@ type task struct {
 	depth int
 }
 
+type PageInfo struct {
+	Title          string
+	ResponseTimeMs int64
+	ContentLength  int
+	Depth          int
+}
+
 // Crawl visits pages up to maxDepth and returns visited set, per-URL errors, and per-page outgoing links.
 // The visitedCallback and errorCallback functions are called when a page is successfully visited or encounters an error.
 // visitedCallback receives the URL, its depth, and the current number of pending tasks in the queue.
 // errorCallback receives the URL, the error, and the current number of pending tasks in the queue.
-func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}) {
+func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}, map[string]PageInfo) {
 	visited := make(map[string]struct{})
 	errs := make(map[string]error)
 	outlinks := make(map[string]map[string]struct{})
+	pageInfos := make(map[string]PageInfo)
 	var mu sync.Mutex
 
 	origin := urlutil.Origin(startURL)
@@ -56,12 +65,14 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				visitedCallback(tk.url, tk.depth, len(tasks))
 			}
 
+			start := time.Now()
 			req, _ := http.NewRequestWithContext(ctx, http.MethodGet, tk.url, nil)
 			req.Header.Set("User-Agent", userAgent)
 			resp, err := client.Do(req)
 			if err != nil {
 				mu.Lock()
 				errs[tk.url] = err
+				pageInfos[tk.url] = PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
 				mu.Unlock()
 
 				if errorCallback != nil {
@@ -73,10 +84,20 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 			func() {
 				defer resp.Body.Close()
 				ct := resp.Header.Get("Content-Type")
+				// Default meta values
+				meta := PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
+				if resp.ContentLength > 0 {
+					meta.ContentLength = int(resp.ContentLength)
+				}
 				if resp.StatusCode != http.StatusOK || ct == "" || (ct != "text/html" && !hasPrefix(ct, "text/html")) {
+					mu.Lock()
+					pageInfos[tk.url] = meta
+					mu.Unlock()
 					return
 				}
 				body, _ := io.ReadAll(resp.Body)
+				meta.ContentLength = len(body)
+				meta.Title = htmlx.ExtractTitle(stringsReader(string(body)))
 				hrefs := htmlx.ExtractAnchors(stringsReader(string(body)))
 				var toEnqueue []string
 				for _, href := range hrefs {
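The gate above records only the default metadata for non-200 or non-HTML responses and skips link and title extraction. A small self-contained sketch of that accept/skip decision, using an illustrative `isHTML` helper and assuming `hasPrefix` behaves like `strings.HasPrefix`:

```go
package main

import (
	"fmt"
	"strings"
)

// isHTML mirrors the crawler's check (illustrative only): a page is parsed for
// links and a title only when it is a 200 response with an HTML content type.
func isHTML(status int, ct string) bool {
	return status == 200 && ct != "" && (ct == "text/html" || strings.HasPrefix(ct, "text/html"))
}

func main() {
	fmt.Println(isHTML(200, "text/html; charset=utf-8")) // true: charset parameters are accepted
	fmt.Println(isHTML(200, "application/json"))         // false: only metadata is recorded
	fmt.Println(isHTML(404, "text/html"))                 // false: non-200 pages are not parsed
}
```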
@@ -102,6 +123,9 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				for _, u := range toEnqueue {
 					enqueue(task{url: u, depth: tk.depth + 1})
 				}
+				mu.Lock()
+				pageInfos[tk.url] = meta
+				mu.Unlock()
 			}()
 			wgTasks.Done()
 		}
@@ -121,7 +145,7 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 	enqueue(task{url: startURL, depth: 0})
 	wgWorkers.Wait()
 
-	return visited, errs, outlinks
+	return visited, errs, outlinks, pageInfos
 }
 
 func hasPrefix(s string, prefix string) bool {
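Taken together, the hunks above extend `Crawl` to also return per-page metadata. A minimal caller sketch, assuming the package import path `urlcrawler/internal/crawler` (so this must live inside the urlcrawler module) and an illustrative user agent; the callbacks here are stand-ins, not the ones `main.go` actually wires up:

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"urlcrawler/internal/crawler"
)

func main() {
	client := &http.Client{Timeout: 10 * time.Second}

	// Illustrative callbacks: print progress as pages are visited or fail.
	onVisit := func(u string, depth, pending int) {
		fmt.Printf("visit %s depth=%d pending=%d\n", u, depth, pending)
	}
	onErr := func(u string, err error, pending int) {
		fmt.Printf("error %s: %v (pending=%d)\n", u, err, pending)
	}

	visited, errs, outlinks, pageInfos := crawler.Crawl(
		context.Background(), "https://example.com", 2, 4, true,
		client, "urlcrawler/0.1", onVisit, onErr,
	)

	fmt.Println(len(visited), "visited,", len(errs), "errors,", len(outlinks), "pages with outlinks")
	for u, pi := range pageInfos {
		fmt.Printf("%s title=%q rt=%dms len=%d depth=%d\n",
			u, pi.Title, pi.ResponseTimeMs, pi.ContentLength, pi.Depth)
	}
}
```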

@@ -36,3 +36,22 @@ func ExtractAnchors(r io.Reader) []string {
 		}
 	}
 }
+
+// ExtractTitle returns the text content of the first <title> element.
+func ExtractTitle(r io.Reader) string {
+	tokens := html.NewTokenizer(r)
+	for {
+		switch tokens.Next() {
+		case html.StartTagToken:
+			name, _ := tokens.TagName()
+			if string(name) == "title" {
+				if tokens.Next() == html.TextToken {
+					t := strings.TrimSpace(string(tokens.Text()))
+					return t
+				}
+			}
+		case html.ErrorToken:
+			return ""
+		}
+	}
+}
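`ExtractTitle` tokenizes until the first `<title>` start tag and trims the text token that follows, returning the empty string when no title is found. A sketch of a unit test, assuming it sits alongside the function in package `htmlx`:

```go
package htmlx

import (
	"strings"
	"testing"
)

func TestExtractTitle(t *testing.T) {
	// Whitespace around the title text is trimmed.
	got := ExtractTitle(strings.NewReader("<html><head><title>  Example Domain  </title></head><body></body></html>"))
	if got != "Example Domain" {
		t.Fatalf("got %q", got)
	}

	// Documents without a <title> yield the empty string.
	if got := ExtractTitle(strings.NewReader("<p>no title here</p>")); got != "" {
		t.Fatalf("expected empty title, got %q", got)
	}
}
```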

@@ -27,6 +27,9 @@ type Report struct {
 	TopExternalDomains []DomainCount          `json:"topExternalDomains,omitempty"`
 	BrokenSample       []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
 	BrokenByDomain     []DomainCount          `json:"brokenByDomain,omitempty"`
+	Pages              map[string]PageMeta    `json:"pages"`
+	DepthDistribution  map[int]int            `json:"depthDistribution"`
+	Robots             RobotsSummary          `json:"robots"`
 }
 
 type Metadata struct {
@@ -58,7 +61,19 @@ type DomainCount struct {
 	Count int `json:"count"`
 }
 
-func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
+type PageMeta struct {
+	Title          string `json:"title"`
+	ResponseTimeMs int64  `json:"responseTimeMs"`
+	ContentLength  int    `json:"contentLength"`
+	Depth          int    `json:"depth"`
+}
+
+type RobotsSummary struct {
+	Present   bool   `json:"present"`
+	FetchedAt string `json:"fetchedAt,omitempty"`
+}
+
+func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params, pages map[string]PageMeta, robots RobotsSummary) Report {
 	crawledList := sanitizeURLs(keys(crawled))
 	sitemapList := sanitizeURLs(keys(sitemap))
 	crawlErrMap := make(map[string]string, len(crawlErrs))
@@ -123,6 +138,12 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
 	topExternal := mapToSortedSlice(extCounts)
 	brokenBy := mapToSortedSlice(brokenByDomain)
 
+	// Depth distribution
+	depthDist := make(map[int]int)
+	for _, pm := range pages {
+		depthDist[pm.Depth]++
+	}
+
 	summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
 		len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
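One detail worth noting for the schema changes further down: `depthDist` is a `map[int]int`, and `encoding/json` writes integer map keys as JSON strings, which is why the documented example shows `{"0": 1, "1": 3}`. A minimal illustration:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	depthDist := map[int]int{0: 1, 1: 3}
	b, _ := json.Marshal(depthDist)
	fmt.Println(string(b)) // {"0":1,"1":3} (integer keys become JSON strings)
}
```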
@@ -143,6 +164,9 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
 		TopExternalDomains: topExternal,
 		BrokenSample:       brokenSample,
 		BrokenByDomain:     brokenBy,
+		Pages:              pages,
+		DepthDistribution:  depthDist,
+		Robots:             robots,
 	}
 }

main.go (33 changed lines)
@@ -16,6 +16,7 @@ import (
 	"urlcrawler/internal/linkcheck"
 	"urlcrawler/internal/report"
 	"urlcrawler/internal/sitemap"
+	"urlcrawler/internal/urlutil"
 )
 
 func main() {
@@ -99,7 +100,7 @@ func main() {
 		currentURL.Store(u)
 	}
 
-	visited, crawlErrs, outlinks := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
+	visited, crawlErrs, outlinks, pageInfo := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
 
 	// Clear progress line before moving to next phase
 	if !quiet {
@@ -113,6 +114,22 @@ func main() {
 		fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
 	}
 
+	// Robots.txt summary (simple)
+	robots := report.RobotsSummary{}
+	robotsURL := urlutil.Origin(target) + "/robots.txt"
+	{
+		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
+		req.Header.Set("User-Agent", userAgent)
+		resp, err := client.Do(req)
+		if err == nil {
+			defer resp.Body.Close()
+			if resp.StatusCode == http.StatusOK {
+				robots.Present = true
+				robots.FetchedAt = time.Now().UTC().Format(time.RFC3339)
+			}
+		}
+	}
+
 	// Build set of all unique links discovered across pages for status checks
 	allLinks := make(map[string]struct{})
 	for _, m := range outlinks {
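The robots probe derives its URL from `urlutil.Origin(target)`. `Origin`'s implementation is not part of this diff; the sketch below uses an illustrative stand-in that keeps only scheme and host, which is the behavior the probe assumes:

```go
package main

import (
	"fmt"
	"net/url"
)

// origin is an illustrative stand-in for urlutil.Origin, not the urlcrawler
// implementation: keep scheme and host, drop path, query, and fragment.
func origin(raw string) string {
	u, err := url.Parse(raw)
	if err != nil {
		return raw
	}
	return u.Scheme + "://" + u.Host
}

func main() {
	target := "https://example.com/docs/getting-started?x=1"
	fmt.Println(origin(target) + "/robots.txt") // https://example.com/robots.txt
}
```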
@@ -153,7 +170,19 @@ func main() {
 	meta.DurationMs = finished.Sub(started).Milliseconds()
 
 	fmt.Fprintf(os.Stderr, "Building report...\n")
-	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params)
+
+	// Convert pageInfo to report.PageMeta
+	pages := make(map[string]report.PageMeta, len(pageInfo))
+	for u, pi := range pageInfo {
+		pages[u] = report.PageMeta{
+			Title:          pi.Title,
+			ResponseTimeMs: pi.ResponseTimeMs,
+			ContentLength:  pi.ContentLength,
+			Depth:          pi.Depth,
+		}
+	}
+
+	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)
 
 	switch output {
 	case "json":

@@ -42,7 +42,16 @@ This document describes the structure of the JSON reports produced by `urlcrawler`
       "status4xx": 1,
       "status5xx": 0,
       "statusOther": 0
     }
   },
+  "reportSummary": "crawled=2 sitemap=2 links=5 ok=4 broken=1",
+  "topExternalDomains": [{"domain": "example-cdn.com", "count": 2}],
+  "brokenSample": [{"url": "https://other.example/broken", "statusCode": 404, "ok": false}],
+  "brokenByDomain": [{"domain": "other.example", "count": 1}],
+  "pages": {
+    "https://example.com": {"title": "Home — Example", "responseTimeMs": 42, "contentLength": 5123, "depth": 0}
+  },
+  "depthDistribution": {"0": 1, "1": 3},
+  "robots": {"present": true, "fetchedAt": "2025-08-31T12:34:59Z"}
 }
 ```

@@ -95,6 +104,15 @@ This document describes the structure of the JSON reports produced by `urlcrawler`
 - **topExternalDomains** (DomainCount[]): Top external domains referenced by links.
 - **brokenSample** (LinkStatus[]): Up to 10 example broken links.
 - **brokenByDomain** (DomainCount[]): Broken link counts grouped by domain.
+- **pages** (object map<string,PageMeta>): Per-page metrics.
+  - **title** (string): The page `<title>` text.
+  - **responseTimeMs** (number): Time to fetch the document.
+  - **contentLength** (number): Size of the fetched body in bytes (best effort).
+  - **depth** (number): Crawl depth from the start URL.
+- **depthDistribution** (object map<number,number>): Count of pages by depth.
+- **robots** (object): robots.txt summary.
+  - **present** (boolean): True if `robots.txt` exists and returned 200.
+  - **fetchedAt** (string, RFC3339; optional): Fetch time when present.
 
 ### Notes
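The added report fields are additive, so consumers can decode only the parts they need. A minimal sketch of reading `pages`, `depthDistribution`, and `robots` from a saved report; the local struct names and the `report.json` path are illustrative:

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

type pageMeta struct {
	Title          string `json:"title"`
	ResponseTimeMs int64  `json:"responseTimeMs"`
	ContentLength  int    `json:"contentLength"`
	Depth          int    `json:"depth"`
}

type partialReport struct {
	Pages             map[string]pageMeta `json:"pages"`
	DepthDistribution map[int]int         `json:"depthDistribution"`
	Robots            struct {
		Present   bool   `json:"present"`
		FetchedAt string `json:"fetchedAt"`
	} `json:"robots"`
}

func main() {
	data, err := os.ReadFile("report.json") // illustrative path
	if err != nil {
		panic(err)
	}
	var r partialReport
	if err := json.Unmarshal(data, &r); err != nil {
		panic(err)
	}
	for u, p := range r.Pages {
		fmt.Printf("%s  %q  %dms  depth=%d\n", u, p.Title, p.ResponseTimeMs, p.Depth)
	}
	fmt.Println("depths:", r.DepthDistribution, "robots present:", r.Robots.Present)
}
```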