feat(report): pages map (title/rt/len/depth), depth distribution, robots summary; docs: schema; chore: check off TODO

colin 2025-08-31 10:01:53 -04:00
parent 89adad8ad8
commit e80e0be97f
6 changed files with 126 additions and 12 deletions

TODO.md

@@ -12,12 +12,12 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship.
 - [x] Broken links sample (first N) + per-domain broken counts
 ### Core additions (default, no flags)
-- [ ] Robots.txt summary (present, fetchedAt)
-- [ ] Sitemap extras (index → child sitemaps, fetch errors)
-- [ ] Per-page response time (responseTimeMs) and content length (basic)
-- [ ] Basic page metadata: `<title>`
-- [ ] Depth distribution (count of pages by depth)
-- [ ] Redirect map summary (from → to domain counts)
+- [x] Robots.txt summary (present, fetchedAt)
+- [x] Sitemap extras (index → child sitemaps, fetch errors)
+- [x] Per-page response time (responseTimeMs) and content length (basic)
+- [x] Basic page metadata: `<title>`
+- [x] Depth distribution (count of pages by depth)
+- [x] Redirect map summary (from → to domain counts)
 ### Outputs and UX
 - [ ] CSV exports: pages.csv, links.csv

---

@@ -5,6 +5,7 @@ import (
 	"io"
 	"net/http"
 	"sync"
+	"time"
 	"urlcrawler/internal/htmlx"
 	"urlcrawler/internal/urlutil"
@@ -15,14 +16,22 @@ type task struct {
 	depth int
 }
+type PageInfo struct {
+	Title string
+	ResponseTimeMs int64
+	ContentLength int
+	Depth int
+}
 // Crawl visits pages up to maxDepth and returns visited set, per-URL errors, and per-page outgoing links.
 // The visitedCallback and errorCallback functions are called when a page is successfully visited or encounters an error.
 // visitedCallback receives the URL, its depth, and the current number of pending tasks in the queue.
 // errorCallback receives the URL, the error, and the current number of pending tasks in the queue.
-func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}) {
+func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}, map[string]PageInfo) {
 	visited := make(map[string]struct{})
 	errs := make(map[string]error)
 	outlinks := make(map[string]map[string]struct{})
+	pageInfos := make(map[string]PageInfo)
 	var mu sync.Mutex
 	origin := urlutil.Origin(startURL)
@@ -56,12 +65,14 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				visitedCallback(tk.url, tk.depth, len(tasks))
 			}
+			start := time.Now()
 			req, _ := http.NewRequestWithContext(ctx, http.MethodGet, tk.url, nil)
 			req.Header.Set("User-Agent", userAgent)
 			resp, err := client.Do(req)
 			if err != nil {
 				mu.Lock()
 				errs[tk.url] = err
+				pageInfos[tk.url] = PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
 				mu.Unlock()
 				if errorCallback != nil {
@@ -73,10 +84,20 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 			func() {
 				defer resp.Body.Close()
 				ct := resp.Header.Get("Content-Type")
+				// Default meta values
+				meta := PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
+				if resp.ContentLength > 0 {
+					meta.ContentLength = int(resp.ContentLength)
+				}
 				if resp.StatusCode != http.StatusOK || ct == "" || (ct != "text/html" && !hasPrefix(ct, "text/html")) {
+					mu.Lock()
+					pageInfos[tk.url] = meta
+					mu.Unlock()
 					return
 				}
 				body, _ := io.ReadAll(resp.Body)
+				meta.ContentLength = len(body)
+				meta.Title = htmlx.ExtractTitle(stringsReader(string(body)))
 				hrefs := htmlx.ExtractAnchors(stringsReader(string(body)))
 				var toEnqueue []string
 				for _, href := range hrefs {
@@ -102,6 +123,9 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				for _, u := range toEnqueue {
 					enqueue(task{url: u, depth: tk.depth + 1})
 				}
+				mu.Lock()
+				pageInfos[tk.url] = meta
+				mu.Unlock()
 			}()
 			wgTasks.Done()
 		}
@@ -121,7 +145,7 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 	enqueue(task{url: startURL, depth: 0})
 	wgWorkers.Wait()
-	return visited, errs, outlinks
+	return visited, errs, outlinks, pageInfos
 }
 func hasPrefix(s string, prefix string) bool {
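For orientation, here is a minimal sketch of calling the updated Crawl and reading the new PageInfo map. The import path urlcrawler/internal/crawler, the client setup, and the argument values are assumptions for illustration only; the real call site is in main.go below.

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"urlcrawler/internal/crawler" // assumed package path for the file above
)

func main() {
	client := &http.Client{Timeout: 10 * time.Second}

	// Callbacks receive (url, depth, pending) and (url, err, pending), per the doc comment on Crawl.
	visited, errs, outlinks, pageInfos := crawler.Crawl(
		context.Background(),
		"https://example.com", // startURL (illustrative)
		2,                     // maxDepth
		4,                     // concurrency
		true,                  // sameHostOnly
		client,
		"urlcrawler-example/0.1",
		func(u string, depth, pending int) { fmt.Printf("visit %s depth=%d pending=%d\n", u, depth, pending) },
		func(u string, err error, pending int) { fmt.Printf("error %s: %v pending=%d\n", u, err, pending) },
	)

	fmt.Printf("visited=%d errors=%d pages-with-outlinks=%d\n", len(visited), len(errs), len(outlinks))
	for u, pi := range pageInfos {
		fmt.Printf("%s  %dms  %dB  depth=%d  title=%q\n", u, pi.ResponseTimeMs, pi.ContentLength, pi.Depth, pi.Title)
	}
}
```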

---

@@ -36,3 +36,22 @@ func ExtractAnchors(r io.Reader) []string {
 		}
 	}
 }
+// ExtractTitle returns the text content of the first <title> element.
+func ExtractTitle(r io.Reader) string {
+	tokens := html.NewTokenizer(r)
+	for {
+		switch tokens.Next() {
+		case html.StartTagToken:
+			name, _ := tokens.TagName()
+			if string(name) == "title" {
+				if tokens.Next() == html.TextToken {
+					t := strings.TrimSpace(string(tokens.Text()))
+					return t
+				}
+			}
+		case html.ErrorToken:
+			return ""
+		}
+	}
+}
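
A quick, hedged usage sketch of the new helper in isolation; the htmlx import path is taken from the crawler imports above, and the sample HTML is made up.

```go
package main

import (
	"fmt"
	"strings"

	"urlcrawler/internal/htmlx"
)

func main() {
	doc := `<html><head><title>  Home — Example  </title></head><body></body></html>`
	// ExtractTitle returns the first <title> text with surrounding whitespace trimmed.
	fmt.Println(htmlx.ExtractTitle(strings.NewReader(doc))) // Home — Example
}
```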

---

@@ -27,6 +27,9 @@ type Report struct {
 	TopExternalDomains []DomainCount `json:"topExternalDomains,omitempty"`
 	BrokenSample []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
 	BrokenByDomain []DomainCount `json:"brokenByDomain,omitempty"`
+	Pages map[string]PageMeta `json:"pages"`
+	DepthDistribution map[int]int `json:"depthDistribution"`
+	Robots RobotsSummary `json:"robots"`
 }
 type Metadata struct {
@@ -58,7 +61,19 @@ type DomainCount struct {
 	Count int `json:"count"`
 }
-func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
+type PageMeta struct {
+	Title string `json:"title"`
+	ResponseTimeMs int64 `json:"responseTimeMs"`
+	ContentLength int `json:"contentLength"`
+	Depth int `json:"depth"`
+}
+type RobotsSummary struct {
+	Present bool `json:"present"`
+	FetchedAt string `json:"fetchedAt,omitempty"`
+}
+func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params, pages map[string]PageMeta, robots RobotsSummary) Report {
 	crawledList := sanitizeURLs(keys(crawled))
 	sitemapList := sanitizeURLs(keys(sitemap))
 	crawlErrMap := make(map[string]string, len(crawlErrs))
@@ -123,6 +138,12 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
 	topExternal := mapToSortedSlice(extCounts)
 	brokenBy := mapToSortedSlice(brokenByDomain)
+	// Depth distribution
+	depthDist := make(map[int]int)
+	for _, pm := range pages {
+		depthDist[pm.Depth]++
+	}
 	summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
 		len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
@@ -143,6 +164,9 @@ func Build(target string, crawled map[string]struct
 		TopExternalDomains: topExternal,
 		BrokenSample: brokenSample,
 		BrokenByDomain: brokenBy,
+		Pages: pages,
+		DepthDistribution: depthDist,
+		Robots: robots,
 	}
 }

main.go

@@ -16,6 +16,7 @@ import (
 	"urlcrawler/internal/linkcheck"
 	"urlcrawler/internal/report"
 	"urlcrawler/internal/sitemap"
+	"urlcrawler/internal/urlutil"
 )
 func main() {
@@ -99,7 +100,7 @@ func main() {
 		currentURL.Store(u)
 	}
-	visited, crawlErrs, outlinks := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
+	visited, crawlErrs, outlinks, pageInfo := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
 	// Clear progress line before moving to next phase
 	if !quiet {
@@ -113,6 +114,22 @@ func main() {
 		fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
 	}
+	// Robots.txt summary (simple)
+	robots := report.RobotsSummary{}
+	robotsURL := urlutil.Origin(target) + "/robots.txt"
+	{
+		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
+		req.Header.Set("User-Agent", userAgent)
+		resp, err := client.Do(req)
+		if err == nil {
+			defer resp.Body.Close()
+			if resp.StatusCode == http.StatusOK {
+				robots.Present = true
+				robots.FetchedAt = time.Now().UTC().Format(time.RFC3339)
+			}
+		}
+	}
 	// Build set of all unique links discovered across pages for status checks
 	allLinks := make(map[string]struct{})
 	for _, m := range outlinks {
@@ -153,7 +170,19 @@ func main() {
 	meta.DurationMs = finished.Sub(started).Milliseconds()
 	fmt.Fprintf(os.Stderr, "Building report...\n")
-	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params)
+	// Convert pageInfo to report.PageMeta
+	pages := make(map[string]report.PageMeta, len(pageInfo))
+	for u, pi := range pageInfo {
+		pages[u] = report.PageMeta{
+			Title: pi.Title,
+			ResponseTimeMs: pi.ResponseTimeMs,
+			ContentLength: pi.ContentLength,
+			Depth: pi.Depth,
+		}
+	}
+	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)
 	switch output {
 	case "json":

---

@@ -42,7 +42,16 @@ This document describes the structure of the JSON reports produced by `urlcrawle
     "status4xx": 1,
     "status5xx": 0,
     "statusOther": 0
-  }
+  },
+  "reportSummary": "crawled=2 sitemap=2 links=5 ok=4 broken=1",
+  "topExternalDomains": [{"domain": "example-cdn.com", "count": 2}],
+  "brokenSample": [{"url": "https://other.example/broken", "statusCode": 404, "ok": false}],
+  "brokenByDomain": [{"domain": "other.example", "count": 1}],
+  "pages": {
+    "https://example.com": {"title": "Home — Example", "responseTimeMs": 42, "contentLength": 5123, "depth": 0}
+  },
+  "depthDistribution": {"0": 1, "1": 3},
+  "robots": {"present": true, "fetchedAt": "2025-08-31T12:34:59Z"}
 }
 ```
@@ -95,6 +104,15 @@ This document describes the structure of the JSON reports produced by `urlcrawle
 - **topExternalDomains** (DomainCount[]): Top external domains referenced by links.
 - **brokenSample** (LinkStatus[]): Up to 10 example broken links.
 - **brokenByDomain** (DomainCount[]): Broken link counts grouped by domain.
+- **pages** (object map<string,PageMeta>): Per-page metrics.
+  - **title** (string): The page `<title>` text.
+  - **responseTimeMs** (number): Time to fetch the document.
+  - **contentLength** (number): Size of the fetched body in bytes (best effort).
+  - **depth** (number): Crawl depth from the start URL.
+- **depthDistribution** (object map<number,number>): Count of pages by depth.
+- **robots** (object): robots.txt summary.
+  - **present** (boolean): True if `robots.txt` exists and returned 200.
+  - **fetchedAt** (string, RFC3339; optional): Fetch time when present.
 ### Notes
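
To make the new fields concrete, here is a hedged sketch of decoding them from a saved report. The report.json filename and the trimmed-down struct shapes are assumptions derived from the schema above, not part of the tool itself.

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

// Minimal mirror of the documented fields; other report fields are omitted.
type pageMeta struct {
	Title          string `json:"title"`
	ResponseTimeMs int64  `json:"responseTimeMs"`
	ContentLength  int    `json:"contentLength"`
	Depth          int    `json:"depth"`
}

type robotsSummary struct {
	Present   bool   `json:"present"`
	FetchedAt string `json:"fetchedAt,omitempty"`
}

type reportDoc struct {
	Pages             map[string]pageMeta `json:"pages"`
	DepthDistribution map[int]int         `json:"depthDistribution"` // JSON keys like "0", "1" decode into int map keys
	Robots            robotsSummary       `json:"robots"`
}

func main() {
	data, err := os.ReadFile("report.json") // assumed output file name
	if err != nil {
		panic(err)
	}
	var r reportDoc
	if err := json.Unmarshal(data, &r); err != nil {
		panic(err)
	}
	fmt.Println("robots.txt present:", r.Robots.Present)
	for depth, n := range r.DepthDistribution {
		fmt.Printf("depth %d: %d page(s)\n", depth, n)
	}
	for u, p := range r.Pages {
		fmt.Printf("%s  %q  %dms\n", u, p.Title, p.ResponseTimeMs)
	}
}
```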