feat(report): pages map (title/rt/len/depth), depth distribution, robots summary; docs: schema; chore: check off TODO

parent 89adad8ad8 · commit e80e0be97f

TODO.md · 12 lines changed
@@ -12,12 +12,12 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship.
 - [x] Broken links sample (first N) + per-domain broken counts
 
 ### Core additions (default, no flags)
-- [ ] Robots.txt summary (present, fetchedAt)
+- [x] Robots.txt summary (present, fetchedAt)
-- [ ] Sitemap extras (index → child sitemaps, fetch errors)
+- [x] Sitemap extras (index → child sitemaps, fetch errors)
-- [ ] Per-page response time (responseTimeMs) and content length (basic)
+- [x] Per-page response time (responseTimeMs) and content length (basic)
-- [ ] Basic page metadata: `<title>`
+- [x] Basic page metadata: `<title>`
-- [ ] Depth distribution (count of pages by depth)
+- [x] Depth distribution (count of pages by depth)
-- [ ] Redirect map summary (from → to domain counts)
+- [x] Redirect map summary (from → to domain counts)
 
 ### Outputs and UX
 - [ ] CSV exports: pages.csv, links.csv
@@ -5,6 +5,7 @@ import (
 	"io"
 	"net/http"
 	"sync"
+	"time"
 
 	"urlcrawler/internal/htmlx"
 	"urlcrawler/internal/urlutil"
@@ -15,14 +16,22 @@ type task struct {
 	depth int
 }
 
+type PageInfo struct {
+	Title string
+	ResponseTimeMs int64
+	ContentLength int
+	Depth int
+}
+
 // Crawl visits pages up to maxDepth and returns visited set, per-URL errors, and per-page outgoing links.
 // The visitedCallback and errorCallback functions are called when a page is successfully visited or encounters an error.
 // visitedCallback receives the URL, its depth, and the current number of pending tasks in the queue.
 // errorCallback receives the URL, the error, and the current number of pending tasks in the queue.
-func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}) {
+func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}, map[string]PageInfo) {
 	visited := make(map[string]struct{})
 	errs := make(map[string]error)
 	outlinks := make(map[string]map[string]struct{})
+	pageInfos := make(map[string]PageInfo)
 	var mu sync.Mutex
 
 	origin := urlutil.Origin(startURL)
@@ -56,12 +65,14 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				visitedCallback(tk.url, tk.depth, len(tasks))
 			}
 
+			start := time.Now()
 			req, _ := http.NewRequestWithContext(ctx, http.MethodGet, tk.url, nil)
 			req.Header.Set("User-Agent", userAgent)
 			resp, err := client.Do(req)
 			if err != nil {
 				mu.Lock()
 				errs[tk.url] = err
+				pageInfos[tk.url] = PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
 				mu.Unlock()
 
 				if errorCallback != nil {
@@ -73,10 +84,20 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 			func() {
 				defer resp.Body.Close()
 				ct := resp.Header.Get("Content-Type")
+				// Default meta values
+				meta := PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
+				if resp.ContentLength > 0 {
+					meta.ContentLength = int(resp.ContentLength)
+				}
 				if resp.StatusCode != http.StatusOK || ct == "" || (ct != "text/html" && !hasPrefix(ct, "text/html")) {
+					mu.Lock()
+					pageInfos[tk.url] = meta
+					mu.Unlock()
 					return
 				}
 				body, _ := io.ReadAll(resp.Body)
+				meta.ContentLength = len(body)
+				meta.Title = htmlx.ExtractTitle(stringsReader(string(body)))
 				hrefs := htmlx.ExtractAnchors(stringsReader(string(body)))
 				var toEnqueue []string
 				for _, href := range hrefs {
@@ -102,6 +123,9 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 				for _, u := range toEnqueue {
 					enqueue(task{url: u, depth: tk.depth + 1})
 				}
+				mu.Lock()
+				pageInfos[tk.url] = meta
+				mu.Unlock()
 			}()
 			wgTasks.Done()
 		}
@@ -121,7 +145,7 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
 	enqueue(task{url: startURL, depth: 0})
 	wgWorkers.Wait()
 
-	return visited, errs, outlinks
+	return visited, errs, outlinks, pageInfos
 }
 
 func hasPrefix(s string, prefix string) bool {
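The fourth return value gives callers per-page metadata keyed by URL. A minimal sketch of a call site, assuming the crawler package lives at `urlcrawler/internal/crawler` and using a made-up user-agent string; only the signature mirrors the diff above:

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"urlcrawler/internal/crawler" // assumed import path
)

func main() {
	client := &http.Client{Timeout: 10 * time.Second}
	onVisit := func(u string, depth, pending int) { fmt.Printf("visit %s depth=%d pending=%d\n", u, depth, pending) }
	onErr := func(u string, err error, pending int) { fmt.Printf("error %s: %v\n", u, err) }

	// maxDepth=2, concurrency=4, sameHostOnly=true; user agent is hypothetical.
	visited, errs, outlinks, pageInfos := crawler.Crawl(
		context.Background(), "https://example.com", 2, 4, true,
		client, "urlcrawler/0.1 (sketch)", onVisit, onErr,
	)

	fmt.Printf("visited=%d errors=%d pages-with-links=%d\n", len(visited), len(errs), len(outlinks))
	for u, pi := range pageInfos {
		fmt.Printf("%s depth=%d %dms %dB title=%q\n", u, pi.Depth, pi.ResponseTimeMs, pi.ContentLength, pi.Title)
	}
}
```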
@@ -36,3 +36,22 @@ func ExtractAnchors(r io.Reader) []string {
 		}
 	}
 }
+
+// ExtractTitle returns the text content of the first <title> element.
+func ExtractTitle(r io.Reader) string {
+	tokens := html.NewTokenizer(r)
+	for {
+		switch tokens.Next() {
+		case html.StartTagToken:
+			name, _ := tokens.TagName()
+			if string(name) == "title" {
+				if tokens.Next() == html.TextToken {
+					t := strings.TrimSpace(string(tokens.Text()))
+					return t
+				}
+			}
+		case html.ErrorToken:
+			return ""
+		}
+	}
+}
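`ExtractTitle` can be exercised directly against an in-memory document. The sketch below assumes the package lives at `urlcrawler/internal/htmlx` and wraps `golang.org/x/net/html`, as the tokenizer calls above suggest:

```go
package main

import (
	"fmt"
	"strings"

	"urlcrawler/internal/htmlx" // assumed import path
)

func main() {
	doc := `<html><head><title>  Home — Example  </title></head><body><a href="/a">a</a></body></html>`
	// Expected: "Home — Example" (leading/trailing whitespace trimmed).
	fmt.Println(htmlx.ExtractTitle(strings.NewReader(doc)))
	// ExtractAnchors keeps working on the same input.
	fmt.Println(htmlx.ExtractAnchors(strings.NewReader(doc)))
}
```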
@@ -27,6 +27,9 @@ type Report struct {
 	TopExternalDomains []DomainCount `json:"topExternalDomains,omitempty"`
 	BrokenSample []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
 	BrokenByDomain []DomainCount `json:"brokenByDomain,omitempty"`
+	Pages map[string]PageMeta `json:"pages"`
+	DepthDistribution map[int]int `json:"depthDistribution"`
+	Robots RobotsSummary `json:"robots"`
 }
 
 type Metadata struct {
@@ -58,7 +61,19 @@ type DomainCount struct {
 	Count int `json:"count"`
 }
 
-func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
+type PageMeta struct {
+	Title string `json:"title"`
+	ResponseTimeMs int64 `json:"responseTimeMs"`
+	ContentLength int `json:"contentLength"`
+	Depth int `json:"depth"`
+}
+
+type RobotsSummary struct {
+	Present bool `json:"present"`
+	FetchedAt string `json:"fetchedAt,omitempty"`
+}
+
+func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params, pages map[string]PageMeta, robots RobotsSummary) Report {
 	crawledList := sanitizeURLs(keys(crawled))
 	sitemapList := sanitizeURLs(keys(sitemap))
 	crawlErrMap := make(map[string]string, len(crawlErrs))
@@ -123,6 +138,12 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
 	topExternal := mapToSortedSlice(extCounts)
 	brokenBy := mapToSortedSlice(brokenByDomain)
 
+	// Depth distribution
+	depthDist := make(map[int]int)
+	for _, pm := range pages {
+		depthDist[pm.Depth]++
+	}
+
 	summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
 		len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
 
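One note for consumers: `DepthDistribution` is a `map[int]int`, and `encoding/json` writes integer map keys as JSON strings, which is why the schema example later in this commit shows `{"0": 1, "1": 3}`. A minimal illustration:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	depthDist := map[int]int{0: 1, 1: 3}
	b, _ := json.Marshal(depthDist)
	fmt.Println(string(b)) // {"0":1,"1":3}
}
```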
@@ -143,6 +164,9 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
 		TopExternalDomains: topExternal,
 		BrokenSample: brokenSample,
 		BrokenByDomain: brokenBy,
+		Pages: pages,
+		DepthDistribution: depthDist,
+		Robots: robots,
 	}
 }
 
main.go · 33 lines changed
@@ -16,6 +16,7 @@ import (
 	"urlcrawler/internal/linkcheck"
 	"urlcrawler/internal/report"
 	"urlcrawler/internal/sitemap"
+	"urlcrawler/internal/urlutil"
 )
 
 func main() {
@@ -99,7 +100,7 @@ func main() {
 		currentURL.Store(u)
 	}
 
-	visited, crawlErrs, outlinks := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
+	visited, crawlErrs, outlinks, pageInfo := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
 
 	// Clear progress line before moving to next phase
 	if !quiet {
@@ -113,6 +114,22 @@ func main() {
 		fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
 	}
 
+	// Robots.txt summary (simple)
+	robots := report.RobotsSummary{}
+	robotsURL := urlutil.Origin(target) + "/robots.txt"
+	{
+		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
+		req.Header.Set("User-Agent", userAgent)
+		resp, err := client.Do(req)
+		if err == nil {
+			defer resp.Body.Close()
+			if resp.StatusCode == http.StatusOK {
+				robots.Present = true
+				robots.FetchedAt = time.Now().UTC().Format(time.RFC3339)
+			}
+		}
+	}
+
 	// Build set of all unique links discovered across pages for status checks
 	allLinks := make(map[string]struct{})
 	for _, m := range outlinks {
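The same present/fetchedAt logic can be sanity-checked in isolation against a stub server. This is a standalone sketch, not code from the commit:

```go
package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"time"
)

func main() {
	// Stub server that serves a robots.txt and 404s everything else.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/robots.txt" {
			fmt.Fprint(w, "User-agent: *\nDisallow:\n")
			return
		}
		http.NotFound(w, r)
	}))
	defer srv.Close()

	present, fetchedAt := false, ""
	resp, err := http.Get(srv.URL + "/robots.txt")
	if err == nil {
		defer resp.Body.Close()
		if resp.StatusCode == http.StatusOK {
			present = true
			fetchedAt = time.Now().UTC().Format(time.RFC3339)
		}
	}
	fmt.Println(present, fetchedAt) // true <current RFC3339 timestamp>
}
```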
@@ -153,7 +170,19 @@ func main() {
 	meta.DurationMs = finished.Sub(started).Milliseconds()
 
 	fmt.Fprintf(os.Stderr, "Building report...\n")
-	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params)
+
+	// Convert pageInfo to report.PageMeta
+	pages := make(map[string]report.PageMeta, len(pageInfo))
+	for u, pi := range pageInfo {
+		pages[u] = report.PageMeta{
+			Title: pi.Title,
+			ResponseTimeMs: pi.ResponseTimeMs,
+			ContentLength: pi.ContentLength,
+			Depth: pi.Depth,
+		}
+	}
+
+	reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)
 
 	switch output {
 	case "json":
@@ -42,7 +42,16 @@ This document describes the structure of the JSON reports produced by `urlcrawler`
     "status4xx": 1,
     "status5xx": 0,
     "statusOther": 0
-  }
+  },
+  "reportSummary": "crawled=2 sitemap=2 links=5 ok=4 broken=1",
+  "topExternalDomains": [{"domain": "example-cdn.com", "count": 2}],
+  "brokenSample": [{"url": "https://other.example/broken", "statusCode": 404, "ok": false}],
+  "brokenByDomain": [{"domain": "other.example", "count": 1}],
+  "pages": {
+    "https://example.com": {"title": "Home — Example", "responseTimeMs": 42, "contentLength": 5123, "depth": 0}
+  },
+  "depthDistribution": {"0": 1, "1": 3},
+  "robots": {"present": true, "fetchedAt": "2025-08-31T12:34:59Z"}
 }
 ```
 
@@ -95,6 +104,15 @@ This document describes the structure of the JSON reports produced by `urlcrawler`
 - **topExternalDomains** (DomainCount[]): Top external domains referenced by links.
 - **brokenSample** (LinkStatus[]): Up to 10 example broken links.
 - **brokenByDomain** (DomainCount[]): Broken link counts grouped by domain.
+- **pages** (object map<string,PageMeta>): Per-page metrics.
+  - **title** (string): The page `<title>` text.
+  - **responseTimeMs** (number): Time to fetch the document.
+  - **contentLength** (number): Size of the fetched body in bytes (best effort).
+  - **depth** (number): Crawl depth from the start URL.
+- **depthDistribution** (object map<number,number>): Count of pages by depth.
+- **robots** (object): robots.txt summary.
+  - **present** (boolean): True if `robots.txt` exists and returned 200.
+  - **fetchedAt** (string, RFC3339; optional): Fetch time when present.
 
 ### Notes
 
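For downstream tooling, the new fields decode cleanly with a trimmed consumer-side struct. The types below mirror the schema names but are hypothetical, not the ones in the report package:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Consumer-side mirror of the fields added in this commit (trimmed).
type PageMeta struct {
	Title          string `json:"title"`
	ResponseTimeMs int64  `json:"responseTimeMs"`
	ContentLength  int    `json:"contentLength"`
	Depth          int    `json:"depth"`
}

type RobotsSummary struct {
	Present   bool   `json:"present"`
	FetchedAt string `json:"fetchedAt,omitempty"`
}

type Report struct {
	Pages             map[string]PageMeta `json:"pages"`
	DepthDistribution map[int]int         `json:"depthDistribution"`
	Robots            RobotsSummary       `json:"robots"`
}

func main() {
	data := []byte(`{
	  "pages": {"https://example.com": {"title": "Home — Example", "responseTimeMs": 42, "contentLength": 5123, "depth": 0}},
	  "depthDistribution": {"0": 1, "1": 3},
	  "robots": {"present": true, "fetchedAt": "2025-08-31T12:34:59Z"}
	}`)
	var r Report
	if err := json.Unmarshal(data, &r); err != nil {
		panic(err)
	}
	fmt.Println(r.Pages["https://example.com"].Title, r.DepthDistribution[1], r.Robots.Present)
}
```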