feat(report): pages map (title/rt/len/depth), depth distribution, robots summary; docs: schema; chore: check off TODO
commit e80e0be97f (parent 89adad8ad8)

TODO.md (12 changed lines)
@@ -12,12 +12,12 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship.
 - [x] Broken links sample (first N) + per-domain broken counts
 
 ### Core additions (default, no flags)
-- [ ] Robots.txt summary (present, fetchedAt)
-- [ ] Sitemap extras (index → child sitemaps, fetch errors)
-- [ ] Per-page response time (responseTimeMs) and content length (basic)
-- [ ] Basic page metadata: `<title>`
-- [ ] Depth distribution (count of pages by depth)
-- [ ] Redirect map summary (from → to domain counts)
+- [x] Robots.txt summary (present, fetchedAt)
+- [x] Sitemap extras (index → child sitemaps, fetch errors)
+- [x] Per-page response time (responseTimeMs) and content length (basic)
+- [x] Basic page metadata: `<title>`
+- [x] Depth distribution (count of pages by depth)
+- [x] Redirect map summary (from → to domain counts)
 
 ### Outputs and UX
 
 - [ ] CSV exports: pages.csv, links.csv

@@ -5,6 +5,7 @@ import (
 	"io"
 	"net/http"
 	"sync"
+	"time"
 
 	"urlcrawler/internal/htmlx"
 	"urlcrawler/internal/urlutil"
@@ -15,14 +16,22 @@ type task struct {
 	depth int
 }
 
+type PageInfo struct {
+	Title          string
+	ResponseTimeMs int64
+	ContentLength  int
+	Depth          int
+}
+
 // Crawl visits pages up to maxDepth and returns visited set, per-URL errors, and per-page outgoing links.
 // The visitedCallback and errorCallback functions are called when a page is successfully visited or encounters an error.
 // visitedCallback receives the URL, its depth, and the current number of pending tasks in the queue.
 // errorCallback receives the URL, the error, and the current number of pending tasks in the queue.
-func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}) {
+func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}, map[string]PageInfo) {
 	visited := make(map[string]struct{})
 	errs := make(map[string]error)
 	outlinks := make(map[string]map[string]struct{})
+	pageInfos := make(map[string]PageInfo)
 	var mu sync.Mutex
 
 	origin := urlutil.Origin(startURL)
@@ -56,12 +65,14 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				visitedCallback(tk.url, tk.depth, len(tasks))
 			}
 
+			start := time.Now()
 			req, _ := http.NewRequestWithContext(ctx, http.MethodGet, tk.url, nil)
 			req.Header.Set("User-Agent", userAgent)
 			resp, err := client.Do(req)
 			if err != nil {
 				mu.Lock()
 				errs[tk.url] = err
+				pageInfos[tk.url] = PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
 				mu.Unlock()
 
 				if errorCallback != nil {
@@ -73,10 +84,20 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 			func() {
 				defer resp.Body.Close()
 				ct := resp.Header.Get("Content-Type")
+				// Default meta values
+				meta := PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
+				if resp.ContentLength > 0 {
+					meta.ContentLength = int(resp.ContentLength)
+				}
 				if resp.StatusCode != http.StatusOK || ct == "" || (ct != "text/html" && !hasPrefix(ct, "text/html")) {
+					mu.Lock()
+					pageInfos[tk.url] = meta
+					mu.Unlock()
 					return
 				}
 				body, _ := io.ReadAll(resp.Body)
+				meta.ContentLength = len(body)
+				meta.Title = htmlx.ExtractTitle(stringsReader(string(body)))
 				hrefs := htmlx.ExtractAnchors(stringsReader(string(body)))
 				var toEnqueue []string
 				for _, href := range hrefs {
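The gate above records only the default metadata for non-200 or non-HTML responses and skips link and title extraction. A small self-contained sketch of that accept/skip decision, using an illustrative `isHTML` helper and assuming `hasPrefix` behaves like `strings.HasPrefix`:

```go
package main

import (
	"fmt"
	"strings"
)

// isHTML mirrors the crawler's check (illustrative only): a page is parsed for
// links and a title only when it is a 200 response with an HTML content type.
func isHTML(status int, ct string) bool {
	return status == 200 && ct != "" && (ct == "text/html" || strings.HasPrefix(ct, "text/html"))
}

func main() {
	fmt.Println(isHTML(200, "text/html; charset=utf-8")) // true: charset parameters are accepted
	fmt.Println(isHTML(200, "application/json"))         // false: only metadata is recorded
	fmt.Println(isHTML(404, "text/html"))                 // false: non-200 pages are not parsed
}
```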
@@ -102,6 +123,9 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				for _, u := range toEnqueue {
 					enqueue(task{url: u, depth: tk.depth + 1})
 				}
+				mu.Lock()
+				pageInfos[tk.url] = meta
+				mu.Unlock()
 			}()
 			wgTasks.Done()
 		}
@@ -121,7 +145,7 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 	enqueue(task{url: startURL, depth: 0})
 	wgWorkers.Wait()
 
-	return visited, errs, outlinks
+	return visited, errs, outlinks, pageInfos
 }
 
 func hasPrefix(s string, prefix string) bool {
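Taken together, the hunks above extend `Crawl` to also return per-page metadata. A minimal caller sketch, assuming the package import path `urlcrawler/internal/crawler` (so this must live inside the urlcrawler module) and an illustrative user agent; the callbacks here are stand-ins, not the ones `main.go` actually wires up:

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"urlcrawler/internal/crawler"
)

func main() {
	client := &http.Client{Timeout: 10 * time.Second}

	// Illustrative callbacks: print progress as pages are visited or fail.
	onVisit := func(u string, depth, pending int) {
		fmt.Printf("visit %s depth=%d pending=%d\n", u, depth, pending)
	}
	onErr := func(u string, err error, pending int) {
		fmt.Printf("error %s: %v (pending=%d)\n", u, err, pending)
	}

	visited, errs, outlinks, pageInfos := crawler.Crawl(
		context.Background(), "https://example.com", 2, 4, true,
		client, "urlcrawler/0.1", onVisit, onErr,
	)

	fmt.Println(len(visited), "visited,", len(errs), "errors,", len(outlinks), "pages with outlinks")
	for u, pi := range pageInfos {
		fmt.Printf("%s title=%q rt=%dms len=%d depth=%d\n",
			u, pi.Title, pi.ResponseTimeMs, pi.ContentLength, pi.Depth)
	}
}
```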

@@ -36,3 +36,22 @@ func ExtractAnchors(r io.Reader) []string {
 		}
 	}
 }
+
+// ExtractTitle returns the text content of the first <title> element.
+func ExtractTitle(r io.Reader) string {
+	tokens := html.NewTokenizer(r)
+	for {
+		switch tokens.Next() {
+		case html.StartTagToken:
+			name, _ := tokens.TagName()
+			if string(name) == "title" {
+				if tokens.Next() == html.TextToken {
+					t := strings.TrimSpace(string(tokens.Text()))
+					return t
+				}
+			}
+		case html.ErrorToken:
+			return ""
+		}
+	}
+}
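`ExtractTitle` tokenizes until the first `<title>` start tag and trims the text token that follows, returning the empty string when no title is found. A sketch of a unit test, assuming it sits alongside the function in package `htmlx`:

```go
package htmlx

import (
	"strings"
	"testing"
)

func TestExtractTitle(t *testing.T) {
	// Whitespace around the title text is trimmed.
	got := ExtractTitle(strings.NewReader("<html><head><title>  Example Domain  </title></head><body></body></html>"))
	if got != "Example Domain" {
		t.Fatalf("got %q", got)
	}

	// Documents without a <title> yield the empty string.
	if got := ExtractTitle(strings.NewReader("<p>no title here</p>")); got != "" {
		t.Fatalf("expected empty title, got %q", got)
	}
}
```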

@@ -27,6 +27,9 @@ type Report struct {
 	TopExternalDomains []DomainCount          `json:"topExternalDomains,omitempty"`
 	BrokenSample       []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
 	BrokenByDomain     []DomainCount          `json:"brokenByDomain,omitempty"`
+	Pages              map[string]PageMeta    `json:"pages"`
+	DepthDistribution  map[int]int            `json:"depthDistribution"`
+	Robots             RobotsSummary          `json:"robots"`
 }
 
 type Metadata struct {
@@ -58,7 +61,19 @@ type DomainCount struct {
 	Count int `json:"count"`
 }
 
-func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
+type PageMeta struct {
+	Title          string `json:"title"`
+	ResponseTimeMs int64  `json:"responseTimeMs"`
+	ContentLength  int    `json:"contentLength"`
+	Depth          int    `json:"depth"`
+}
+
+type RobotsSummary struct {
+	Present   bool   `json:"present"`
+	FetchedAt string `json:"fetchedAt,omitempty"`
+}
+
+func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params, pages map[string]PageMeta, robots RobotsSummary) Report {
 	crawledList := sanitizeURLs(keys(crawled))
 	sitemapList := sanitizeURLs(keys(sitemap))
 	crawlErrMap := make(map[string]string, len(crawlErrs))
@@ -123,6 +138,12 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
 	topExternal := mapToSortedSlice(extCounts)
 	brokenBy := mapToSortedSlice(brokenByDomain)
 
+	// Depth distribution
+	depthDist := make(map[int]int)
+	for _, pm := range pages {
+		depthDist[pm.Depth]++
+	}
+
 	summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
 		len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
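One detail worth noting for the schema changes further down: `depthDist` is a `map[int]int`, and `encoding/json` writes integer map keys as JSON strings, which is why the documented example shows `{"0": 1, "1": 3}`. A minimal illustration:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	depthDist := map[int]int{0: 1, 1: 3}
	b, _ := json.Marshal(depthDist)
	fmt.Println(string(b)) // {"0":1,"1":3} (integer keys become JSON strings)
}
```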
@@ -143,6 +164,9 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
 		TopExternalDomains: topExternal,
 		BrokenSample:       brokenSample,
 		BrokenByDomain:     brokenBy,
+		Pages:              pages,
+		DepthDistribution:  depthDist,
+		Robots:             robots,
 	}
 }

main.go (33 changed lines)
@@ -16,6 +16,7 @@ import (
 	"urlcrawler/internal/linkcheck"
 	"urlcrawler/internal/report"
 	"urlcrawler/internal/sitemap"
+	"urlcrawler/internal/urlutil"
 )
 
 func main() {
@@ -99,7 +100,7 @@ func main() {
 		currentURL.Store(u)
 	}
 
-	visited, crawlErrs, outlinks := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
+	visited, crawlErrs, outlinks, pageInfo := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
 
 	// Clear progress line before moving to next phase
 	if !quiet {
@@ -113,6 +114,22 @@ func main() {
 		fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
 	}
 
+	// Robots.txt summary (simple)
+	robots := report.RobotsSummary{}
+	robotsURL := urlutil.Origin(target) + "/robots.txt"
+	{
+		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
+		req.Header.Set("User-Agent", userAgent)
+		resp, err := client.Do(req)
+		if err == nil {
+			defer resp.Body.Close()
+			if resp.StatusCode == http.StatusOK {
+				robots.Present = true
+				robots.FetchedAt = time.Now().UTC().Format(time.RFC3339)
+			}
+		}
+	}
+
 	// Build set of all unique links discovered across pages for status checks
 	allLinks := make(map[string]struct{})
 	for _, m := range outlinks {
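The robots probe derives its URL from `urlutil.Origin(target)`. `Origin`'s implementation is not part of this diff; the sketch below uses an illustrative stand-in that keeps only scheme and host, which is the behavior the probe assumes:

```go
package main

import (
	"fmt"
	"net/url"
)

// origin is an illustrative stand-in for urlutil.Origin, not the urlcrawler
// implementation: keep scheme and host, drop path, query, and fragment.
func origin(raw string) string {
	u, err := url.Parse(raw)
	if err != nil {
		return raw
	}
	return u.Scheme + "://" + u.Host
}

func main() {
	target := "https://example.com/docs/getting-started?x=1"
	fmt.Println(origin(target) + "/robots.txt") // https://example.com/robots.txt
}
```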
@@ -153,7 +170,19 @@ func main() {
 	meta.DurationMs = finished.Sub(started).Milliseconds()
 
 	fmt.Fprintf(os.Stderr, "Building report...\n")
-	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params)
+
+	// Convert pageInfo to report.PageMeta
+	pages := make(map[string]report.PageMeta, len(pageInfo))
+	for u, pi := range pageInfo {
+		pages[u] = report.PageMeta{
+			Title:          pi.Title,
+			ResponseTimeMs: pi.ResponseTimeMs,
+			ContentLength:  pi.ContentLength,
+			Depth:          pi.Depth,
+		}
+	}
+
+	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)
 
 	switch output {
 	case "json":

@@ -42,7 +42,16 @@ This document describes the structure of the JSON reports produced by `urlcrawler`
       "status4xx": 1,
       "status5xx": 0,
       "statusOther": 0
     }
   },
+  "reportSummary": "crawled=2 sitemap=2 links=5 ok=4 broken=1",
+  "topExternalDomains": [{"domain": "example-cdn.com", "count": 2}],
+  "brokenSample": [{"url": "https://other.example/broken", "statusCode": 404, "ok": false}],
+  "brokenByDomain": [{"domain": "other.example", "count": 1}],
+  "pages": {
+    "https://example.com": {"title": "Home — Example", "responseTimeMs": 42, "contentLength": 5123, "depth": 0}
+  },
+  "depthDistribution": {"0": 1, "1": 3},
+  "robots": {"present": true, "fetchedAt": "2025-08-31T12:34:59Z"}
 }
 ```

@@ -95,6 +104,15 @@ This document describes the structure of the JSON reports produced by `urlcrawler`
 - **topExternalDomains** (DomainCount[]): Top external domains referenced by links.
 - **brokenSample** (LinkStatus[]): Up to 10 example broken links.
 - **brokenByDomain** (DomainCount[]): Broken link counts grouped by domain.
+- **pages** (object map<string,PageMeta>): Per-page metrics.
+  - **title** (string): The page `<title>` text.
+  - **responseTimeMs** (number): Time to fetch the document.
+  - **contentLength** (number): Size of the fetched body in bytes (best effort).
+  - **depth** (number): Crawl depth from the start URL.
+- **depthDistribution** (object map<number,number>): Count of pages by depth.
+- **robots** (object): robots.txt summary.
+  - **present** (boolean): True if `robots.txt` exists and returned 200.
+  - **fetchedAt** (string, RFC3339; optional): Fetch time when present.
 
 ### Notes
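The added report fields are additive, so consumers can decode only the parts they need. A minimal sketch of reading `pages`, `depthDistribution`, and `robots` from a saved report; the local struct names and the `report.json` path are illustrative:

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

type pageMeta struct {
	Title          string `json:"title"`
	ResponseTimeMs int64  `json:"responseTimeMs"`
	ContentLength  int    `json:"contentLength"`
	Depth          int    `json:"depth"`
}

type partialReport struct {
	Pages             map[string]pageMeta `json:"pages"`
	DepthDistribution map[int]int         `json:"depthDistribution"`
	Robots            struct {
		Present   bool   `json:"present"`
		FetchedAt string `json:"fetchedAt"`
	} `json:"robots"`
}

func main() {
	data, err := os.ReadFile("report.json") // illustrative path
	if err != nil {
		panic(err)
	}
	var r partialReport
	if err := json.Unmarshal(data, &r); err != nil {
		panic(err)
	}
	for u, p := range r.Pages {
		fmt.Printf("%s  %q  %dms  depth=%d\n", u, p.Title, p.ResponseTimeMs, p.Depth)
	}
	fmt.Println("depths:", r.DepthDistribution, "robots present:", r.Robots.Present)
}
```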