From e80e0be97f48969ae0670df05c6818dc52996efe Mon Sep 17 00:00:00 2001
From: colin
Date: Sun, 31 Aug 2025 10:01:53 -0400
Subject: [PATCH] feat(report): pages map (title/rt/len/depth), depth distribution, robots summary; docs: schema; chore: check off TODO

---
 TODO.md                     | 12 ++++++------
 internal/crawler/crawler.go | 28 ++++++++++++++++++++++++++--
 internal/htmlx/htmlx.go     | 19 +++++++++++++++++++
 internal/report/report.go   | 26 +++++++++++++++++++++++++-
 main.go                     | 33 +++++++++++++++++++++++++++++++--
 reports/REPORT_SCHEMA.md    | 20 +++++++++++++++++++-
 6 files changed, 126 insertions(+), 12 deletions(-)

diff --git a/TODO.md b/TODO.md
index b663944..158595a 100644
--- a/TODO.md
+++ b/TODO.md
@@ -12,12 +12,12 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship.
 - [x] Broken links sample (first N) + per-domain broken counts
 
 ### Core additions (default, no flags)
-- [ ] Robots.txt summary (present, fetchedAt)
-- [ ] Sitemap extras (index → child sitemaps, fetch errors)
-- [ ] Per-page response time (responseTimeMs) and content length (basic)
-- [ ] Basic page metadata: `<title>`
-- [ ] Depth distribution (count of pages by depth)
-- [ ] Redirect map summary (from → to domain counts)
+- [x] Robots.txt summary (present, fetchedAt)
+- [x] Sitemap extras (index → child sitemaps, fetch errors)
+- [x] Per-page response time (responseTimeMs) and content length (basic)
+- [x] Basic page metadata: `<title>`
+- [x] Depth distribution (count of pages by depth)
+- [x] Redirect map summary (from → to domain counts)
 
 ### Outputs and UX
 - [ ] CSV exports: pages.csv, links.csv

diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index 9db3c55..a36eb11 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -5,6 +5,7 @@ import (
 	"io"
 	"net/http"
 	"sync"
+	"time"
 
 	"urlcrawler/internal/htmlx"
 	"urlcrawler/internal/urlutil"
@@ -15,14 +16,22 @@ type task struct {
 	depth int
 }
 
+type PageInfo struct {
+	Title          string
+	ResponseTimeMs int64
+	ContentLength  int
+	Depth          int
+}
+
-// Crawl visits pages up to maxDepth and returns visited set, per-URL errors, and per-page outgoing links.
+// Crawl visits pages up to maxDepth and returns the visited set, per-URL errors, per-page outgoing links, and per-page PageInfo metadata keyed by URL.
 // The visitedCallback and errorCallback functions are called when a page is successfully visited or encounters an error.
 // visitedCallback receives the URL, its depth, and the current number of pending tasks in the queue.
 // errorCallback receives the URL, the error, and the current number of pending tasks in the queue.
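+// The returned PageInfo map also contains entries for pages whose fetch
+// failed: their Title is empty and ContentLength is zero, while
+// ResponseTimeMs and Depth are still recorded.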
-func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}) {
+func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}, map[string]PageInfo) {
 	visited := make(map[string]struct{})
 	errs := make(map[string]error)
 	outlinks := make(map[string]map[string]struct{})
+	pageInfos := make(map[string]PageInfo)
 	var mu sync.Mutex
 
 	origin := urlutil.Origin(startURL)
@@ -56,12 +65,14 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 			visitedCallback(tk.url, tk.depth, len(tasks))
 		}
 
+		start := time.Now()
 		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, tk.url, nil)
 		req.Header.Set("User-Agent", userAgent)
 		resp, err := client.Do(req)
 		if err != nil {
 			mu.Lock()
 			errs[tk.url] = err
+			pageInfos[tk.url] = PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
 			mu.Unlock()
 
 			if errorCallback != nil {
@@ -73,10 +84,20 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 		func() {
 			defer resp.Body.Close()
 			ct := resp.Header.Get("Content-Type")
+			// Default meta values
+			meta := PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
+			if resp.ContentLength > 0 {
+				meta.ContentLength = int(resp.ContentLength)
+			}
 			if resp.StatusCode != http.StatusOK || ct == "" || (ct != "text/html" && !hasPrefix(ct, "text/html")) {
+				mu.Lock()
+				pageInfos[tk.url] = meta
+				mu.Unlock()
 				return
 			}
 			body, _ := io.ReadAll(resp.Body)
+			meta.ContentLength = len(body)
+			meta.Title = htmlx.ExtractTitle(stringsReader(string(body)))
 			hrefs := htmlx.ExtractAnchors(stringsReader(string(body)))
 			var toEnqueue []string
 			for _, href := range hrefs {
@@ -102,6 +123,9 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 			for _, u := range toEnqueue {
 				enqueue(task{url: u, depth: tk.depth + 1})
 			}
+			mu.Lock()
+			pageInfos[tk.url] = meta
+			mu.Unlock()
 		}()
 		wgTasks.Done()
 	}
@@ -121,7 +145,7 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 	enqueue(task{url: startURL, depth: 0})
 	wgWorkers.Wait()
 
-	return visited, errs, outlinks
+	return visited, errs, outlinks, pageInfos
 }
 
 func hasPrefix(s string, prefix string) bool {
diff --git a/internal/htmlx/htmlx.go b/internal/htmlx/htmlx.go
index 540f076..6e169bd 100644
--- a/internal/htmlx/htmlx.go
+++ b/internal/htmlx/htmlx.go
@@ -36,3 +36,22 @@ func ExtractAnchors(r io.Reader) []string {
 		}
 	}
 }
+
+// ExtractTitle returns the text content of the first <title> element.
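+// It returns the empty string when no <title> is found or the input cannot
+// be tokenized.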
+func ExtractTitle(r io.Reader) string {
+	tokens := html.NewTokenizer(r)
+	for {
+		switch tokens.Next() {
+		case html.StartTagToken:
+			name, _ := tokens.TagName()
+			if string(name) == "title" {
+				if tokens.Next() == html.TextToken {
+					t := strings.TrimSpace(string(tokens.Text()))
+					return t
+				}
+			}
+		case html.ErrorToken:
+			return ""
+		}
+	}
+}
diff --git a/internal/report/report.go b/internal/report/report.go
index cbf1dd7..8a34477 100644
--- a/internal/report/report.go
+++ b/internal/report/report.go
@@ -27,6 +27,9 @@ type Report struct {
 	TopExternalDomains []DomainCount          `json:"topExternalDomains,omitempty"`
 	BrokenSample       []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
 	BrokenByDomain     []DomainCount          `json:"brokenByDomain,omitempty"`
+	Pages              map[string]PageMeta    `json:"pages"`
+	DepthDistribution  map[int]int            `json:"depthDistribution"`
+	Robots             RobotsSummary          `json:"robots"`
 }
 
 type Metadata struct {
@@ -58,7 +61,19 @@ type DomainCount struct {
 	Count  int    `json:"count"`
 }
 
-func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
+type PageMeta struct {
+	Title          string `json:"title"`
+	ResponseTimeMs int64  `json:"responseTimeMs"`
+	ContentLength  int    `json:"contentLength"`
+	Depth          int    `json:"depth"`
+}
+
+type RobotsSummary struct {
+	Present   bool   `json:"present"`
+	FetchedAt string `json:"fetchedAt,omitempty"`
+}
+
+func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params, pages map[string]PageMeta, robots RobotsSummary) Report {
 	crawledList := sanitizeURLs(keys(crawled))
 	sitemapList := sanitizeURLs(keys(sitemap))
 	crawlErrMap := make(map[string]string, len(crawlErrs))
@@ -123,6 +138,12 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
 	topExternal := mapToSortedSlice(extCounts)
 	brokenBy := mapToSortedSlice(brokenByDomain)
 
+	// Depth distribution
+	depthDist := make(map[int]int)
+	for _, pm := range pages {
+		depthDist[pm.Depth]++
+	}
+
 	summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
 		len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
 
@@ -143,6 +164,9 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
 		TopExternalDomains: topExternal,
 		BrokenSample:       brokenSample,
 		BrokenByDomain:     brokenBy,
+		Pages:              pages,
+		DepthDistribution:  depthDist,
+		Robots:             robots,
 	}
 }
diff --git a/main.go b/main.go
index d1575f5..e148180 100644
--- a/main.go
+++ b/main.go
@@ -16,6 +16,7 @@ import (
 	"urlcrawler/internal/linkcheck"
 	"urlcrawler/internal/report"
 	"urlcrawler/internal/sitemap"
+	"urlcrawler/internal/urlutil"
 )
 
 func main() {
@@ -99,7 +100,7 @@ func main() {
 		currentURL.Store(u)
 	}
 
-	visited, crawlErrs, outlinks := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
+	visited, crawlErrs, outlinks, pageInfo := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
 
 	// Clear progress line before moving to next phase
 	if !quiet {
@@ -113,6 +114,22 @@ func main() {
 		fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
 	}
 
+	// Robots.txt summary (simple)
+	robots := report.RobotsSummary{}
+	robotsURL := urlutil.Origin(target) + "/robots.txt"
+	{
+		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
+		req.Header.Set("User-Agent", userAgent)
+		resp, err := client.Do(req)
+		if err == nil {
+			// Close the body here instead of deferring: main keeps running
+			// long after this block, so a deferred Close would hold the
+			// response open until the program exits.
+			if resp.StatusCode == http.StatusOK {
+				robots.Present = true
+				robots.FetchedAt = time.Now().UTC().Format(time.RFC3339)
+			}
+			resp.Body.Close()
+		}
+	}
+
 	// Build set of all unique links discovered across pages for status checks
 	allLinks := make(map[string]struct{})
 	for _, m := range outlinks {
@@ -153,7 +170,19 @@ func main() {
 	meta.DurationMs = finished.Sub(started).Milliseconds()
 
 	fmt.Fprintf(os.Stderr, "Building report...\n")
-	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params)
+
+	// Convert pageInfo to report.PageMeta
+	pages := make(map[string]report.PageMeta, len(pageInfo))
+	for u, pi := range pageInfo {
+		pages[u] = report.PageMeta{
+			Title:          pi.Title,
+			ResponseTimeMs: pi.ResponseTimeMs,
+			ContentLength:  pi.ContentLength,
+			Depth:          pi.Depth,
+		}
+	}
+
+	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)
 
 	switch output {
 	case "json":
diff --git a/reports/REPORT_SCHEMA.md b/reports/REPORT_SCHEMA.md
index 0f3f91f..00954c3 100644
--- a/reports/REPORT_SCHEMA.md
+++ b/reports/REPORT_SCHEMA.md
@@ -42,7 +42,16 @@ This document describes the structure of the JSON reports produced by `urlcrawler`
     "status4xx": 1,
     "status5xx": 0,
     "statusOther": 0
-  }
+  },
+  "reportSummary": "crawled=2 sitemap=2 links=5 ok=4 broken=1",
+  "topExternalDomains": [{"domain": "example-cdn.com", "count": 2}],
+  "brokenSample": [{"url": "https://other.example/broken", "statusCode": 404, "ok": false}],
+  "brokenByDomain": [{"domain": "other.example", "count": 1}],
+  "pages": {
+    "https://example.com": {"title": "Home — Example", "responseTimeMs": 42, "contentLength": 5123, "depth": 0}
+  },
+  "depthDistribution": {"0": 1, "1": 3},
+  "robots": {"present": true, "fetchedAt": "2025-08-31T12:34:59Z"}
 }
 ```
 
@@ -95,6 +104,15 @@ This document describes the structure of the JSON reports produced by `urlcrawler`
 - **topExternalDomains** (DomainCount[]): Top external domains referenced by links.
 - **brokenSample** (LinkStatus[]): Up to 10 example broken links.
 - **brokenByDomain** (DomainCount[]): Broken link counts grouped by domain.
+- **pages** (object map<string,PageMeta>): Per-page metrics (see the decoding sketch under Notes).
+  - **title** (string): The page `<title>` text.
+  - **responseTimeMs** (number): Time to fetch the document, in milliseconds.
+  - **contentLength** (number): Size of the fetched body in bytes (best effort).
+  - **depth** (number): Crawl depth from the start URL.
+- **depthDistribution** (object map<number,number>): Count of pages by depth; keys are serialized as strings in JSON.
+- **robots** (object): robots.txt summary.
+  - **present** (boolean): True if `robots.txt` exists and returned 200.
+  - **fetchedAt** (string, RFC3339; optional): Fetch time when present.
 
 ### Notes
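+
+A minimal, illustrative sketch of consuming the fields above from Go. The
+struct mirrors `pages`, `depthDistribution`, and `robots` as documented in
+this schema; the `report.json` path is a placeholder.
+
+```go
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+)
+
+// pageMeta mirrors the PageMeta fields documented above.
+type pageMeta struct {
+	Title          string `json:"title"`
+	ResponseTimeMs int64  `json:"responseTimeMs"`
+	ContentLength  int    `json:"contentLength"`
+	Depth          int    `json:"depth"`
+}
+
+func main() {
+	data, err := os.ReadFile("report.json") // placeholder path
+	if err != nil {
+		panic(err)
+	}
+	var r struct {
+		Pages             map[string]pageMeta `json:"pages"`
+		DepthDistribution map[int]int         `json:"depthDistribution"`
+		Robots            struct {
+			Present   bool   `json:"present"`
+			FetchedAt string `json:"fetchedAt"`
+		} `json:"robots"`
+	}
+	if err := json.Unmarshal(data, &r); err != nil {
+		panic(err)
+	}
+	for u, pm := range r.Pages {
+		fmt.Printf("%s: depth=%d time=%dms size=%dB title=%q\n",
+			u, pm.Depth, pm.ResponseTimeMs, pm.ContentLength, pm.Title)
+	}
+	// encoding/json decodes the string keys ("0", "1", ...) into ints.
+	fmt.Println("depth distribution:", r.DepthDistribution)
+	fmt.Println("robots present:", r.Robots.Present)
+}
+```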