diff --git a/TODO.md b/TODO.md
index b663944..158595a 100644
--- a/TODO.md
+++ b/TODO.md
@@ -12,12 +12,12 @@ Prioritized from easiest/low-risk to more involved work. Check off as we ship.
- [x] Broken links sample (first N) + per-domain broken counts
### Core additions (default, no flags)
-- [ ] Robots.txt summary (present, fetchedAt)
-- [ ] Sitemap extras (index → child sitemaps, fetch errors)
-- [ ] Per-page response time (responseTimeMs) and content length (basic)
-- [ ] Basic page metadata: `<title>`
-- [ ] Depth distribution (count of pages by depth)
-- [ ] Redirect map summary (from → to domain counts)
+- [x] Robots.txt summary (present, fetchedAt)
+- [x] Sitemap extras (index → child sitemaps, fetch errors)
+- [x] Per-page response time (responseTimeMs) and content length (basic)
+- [x] Basic page metadata: `<title>`
+- [x] Depth distribution (count of pages by depth)
+- [x] Redirect map summary (from → to domain counts)
### Outputs and UX
- [ ] CSV exports: pages.csv, links.csv
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index 9db3c55..a36eb11 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -5,6 +5,7 @@ import (
"io"
"net/http"
"sync"
+ "time"
"urlcrawler/internal/htmlx"
"urlcrawler/internal/urlutil"
@@ -15,14 +16,22 @@ type task struct {
depth int
}
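+// PageInfo holds per-page metrics collected during the crawl.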
+type PageInfo struct {
+ Title string
+ ResponseTimeMs int64
+ ContentLength int
+ Depth int
+}
+
-// Crawl visits pages up to maxDepth and returns visited set, per-URL errors, and per-page outgoing links.
+// Crawl visits pages up to maxDepth and returns the visited set, per-URL errors, per-page outgoing links, and per-page metadata.
// The visitedCallback and errorCallback functions are called when a page is successfully visited or encounters an error.
// visitedCallback receives the URL, its depth, and the current number of pending tasks in the queue.
// errorCallback receives the URL, the error, and the current number of pending tasks in the queue.
-func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}) {
+func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int, sameHostOnly bool, client *http.Client, userAgent string, visitedCallback func(string, int, int), errorCallback func(string, error, int)) (map[string]struct{}, map[string]error, map[string]map[string]struct{}, map[string]PageInfo) {
visited := make(map[string]struct{})
errs := make(map[string]error)
outlinks := make(map[string]map[string]struct{})
+ pageInfos := make(map[string]PageInfo)
var mu sync.Mutex
origin := urlutil.Origin(startURL)
@@ -56,12 +65,14 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
visitedCallback(tk.url, tk.depth, len(tasks))
}
+ start := time.Now()
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, tk.url, nil)
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err != nil {
mu.Lock()
errs[tk.url] = err
+ pageInfos[tk.url] = PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
mu.Unlock()
if errorCallback != nil {
@@ -73,10 +84,20 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
func() {
defer resp.Body.Close()
ct := resp.Header.Get("Content-Type")
+ // Default meta values
+ meta := PageInfo{Title: "", ResponseTimeMs: time.Since(start).Milliseconds(), ContentLength: 0, Depth: tk.depth}
+ if resp.ContentLength > 0 {
+ meta.ContentLength = int(resp.ContentLength)
+ }
if resp.StatusCode != http.StatusOK || ct == "" || (ct != "text/html" && !hasPrefix(ct, "text/html")) {
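+ // Record timing metadata even when the body is skipped (non-200 status or non-HTML content type).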
+ mu.Lock()
+ pageInfos[tk.url] = meta
+ mu.Unlock()
return
}
body, _ := io.ReadAll(resp.Body)
+ meta.ContentLength = len(body)
+ meta.Title = htmlx.ExtractTitle(stringsReader(string(body)))
hrefs := htmlx.ExtractAnchors(stringsReader(string(body)))
var toEnqueue []string
for _, href := range hrefs {
@@ -102,6 +123,9 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
for _, u := range toEnqueue {
enqueue(task{url: u, depth: tk.depth + 1})
}
+ mu.Lock()
+ pageInfos[tk.url] = meta
+ mu.Unlock()
}()
wgTasks.Done()
}
@@ -121,7 +145,7 @@ func Crawl(ctx context.Context, startURL string, maxDepth int, concurrency int,
enqueue(task{url: startURL, depth: 0})
wgWorkers.Wait()
- return visited, errs, outlinks
+ return visited, errs, outlinks, pageInfos
}
func hasPrefix(s string, prefix string) bool {
diff --git a/internal/htmlx/htmlx.go b/internal/htmlx/htmlx.go
index 540f076..6e169bd 100644
--- a/internal/htmlx/htmlx.go
+++ b/internal/htmlx/htmlx.go
@@ -36,3 +36,22 @@ func ExtractAnchors(r io.Reader) []string {
}
}
}
+
+// ExtractTitle returns the text content of the first <title> element, or "" if none is found.
+func ExtractTitle(r io.Reader) string {
+ tokens := html.NewTokenizer(r)
+ for {
+ switch tokens.Next() {
+ case html.StartTagToken:
+ name, _ := tokens.TagName()
+ if string(name) == "title" {
+ if tokens.Next() == html.TextToken {
+ t := strings.TrimSpace(string(tokens.Text()))
+ return t
+ }
+ }
+ case html.ErrorToken:
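+ // End of input (or a parse error) reached without finding a <title> tag.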
+ return ""
+ }
+ }
+}
diff --git a/internal/report/report.go b/internal/report/report.go
index cbf1dd7..8a34477 100644
--- a/internal/report/report.go
+++ b/internal/report/report.go
@@ -27,6 +27,9 @@ type Report struct {
TopExternalDomains []DomainCount `json:"topExternalDomains,omitempty"`
BrokenSample []linkcheck.LinkStatus `json:"brokenSample,omitempty"`
BrokenByDomain []DomainCount `json:"brokenByDomain,omitempty"`
+ Pages map[string]PageMeta `json:"pages"`
+ DepthDistribution map[int]int `json:"depthDistribution"`
+ Robots RobotsSummary `json:"robots"`
}
type Metadata struct {
@@ -58,7 +61,19 @@ type DomainCount struct {
Count int `json:"count"`
}
-func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params) Report {
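+// PageMeta describes per-page metrics included in the report.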
+type PageMeta struct {
+ Title string `json:"title"`
+ ResponseTimeMs int64 `json:"responseTimeMs"`
+ ContentLength int `json:"contentLength"`
+ Depth int `json:"depth"`
+}
+
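+// RobotsSummary records whether robots.txt was found and when it was fetched.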
+type RobotsSummary struct {
+ Present bool `json:"present"`
+ FetchedAt string `json:"fetchedAt,omitempty"`
+}
+
+func Build(target string, crawled map[string]struct{}, sitemap map[string]struct{}, crawlErrs map[string]error, check linkcheck.Results, outlinks map[string]map[string]struct{}, meta Metadata, params Params, pages map[string]PageMeta, robots RobotsSummary) Report {
crawledList := sanitizeURLs(keys(crawled))
sitemapList := sanitizeURLs(keys(sitemap))
crawlErrMap := make(map[string]string, len(crawlErrs))
@@ -123,6 +138,12 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
topExternal := mapToSortedSlice(extCounts)
brokenBy := mapToSortedSlice(brokenByDomain)
+ // Depth distribution
+ depthDist := make(map[int]int)
+ for _, pm := range pages {
+ depthDist[pm.Depth]++
+ }
+
summary := fmt.Sprintf("crawled=%d sitemap=%d links=%d ok=%d broken=%d",
len(crawledList), len(sitemapList), len(check.Statuses), st.OK, st.Broken)
@@ -143,6 +164,9 @@ func Build(target string, crawled map[string]struct{}, sitemap map[string]struct
TopExternalDomains: topExternal,
BrokenSample: brokenSample,
BrokenByDomain: brokenBy,
+ Pages: pages,
+ DepthDistribution: depthDist,
+ Robots: robots,
}
}
diff --git a/main.go b/main.go
index d1575f5..e148180 100644
--- a/main.go
+++ b/main.go
@@ -16,6 +16,7 @@ import (
"urlcrawler/internal/linkcheck"
"urlcrawler/internal/report"
"urlcrawler/internal/sitemap"
+ "urlcrawler/internal/urlutil"
)
func main() {
@@ -99,7 +100,7 @@ func main() {
currentURL.Store(u)
}
- visited, crawlErrs, outlinks := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
+ visited, crawlErrs, outlinks, pageInfo := crawler.Crawl(ctx, target, maxDepth, concurrency, sameHostOnly, client, userAgent, visitedCallback, errorCallback)
// Clear progress line before moving to next phase
if !quiet {
@@ -113,6 +114,22 @@ func main() {
fmt.Fprintf(os.Stderr, "sitemap error: %v\n", err)
}
+ // Robots.txt summary (simple)
+ robots := report.RobotsSummary{}
+ robotsURL := urlutil.Origin(target) + "/robots.txt"
+ {
+ req, _ := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
+ req.Header.Set("User-Agent", userAgent)
+ resp, err := client.Do(req)
+ if err == nil {
+ if resp.StatusCode == http.StatusOK {
+ robots.Present = true
+ robots.FetchedAt = time.Now().UTC().Format(time.RFC3339)
+ }
+ resp.Body.Close()
+ }
+ }
+
// Build set of all unique links discovered across pages for status checks
allLinks := make(map[string]struct{})
for _, m := range outlinks {
@@ -153,7 +170,19 @@ func main() {
meta.DurationMs = finished.Sub(started).Milliseconds()
fmt.Fprintf(os.Stderr, "Building report...\n")
- reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params)
+
+ // Convert pageInfo to report.PageMeta
+ pages := make(map[string]report.PageMeta, len(pageInfo))
+ for u, pi := range pageInfo {
+ pages[u] = report.PageMeta{
+ Title: pi.Title,
+ ResponseTimeMs: pi.ResponseTimeMs,
+ ContentLength: pi.ContentLength,
+ Depth: pi.Depth,
+ }
+ }
+
+ reports := report.Build(target, visited, smURLs, crawlErrs, checkResults, outlinks, meta, params, pages, robots)
switch output {
case "json":
diff --git a/reports/REPORT_SCHEMA.md b/reports/REPORT_SCHEMA.md
index 0f3f91f..00954c3 100644
--- a/reports/REPORT_SCHEMA.md
+++ b/reports/REPORT_SCHEMA.md
@@ -42,7 +42,16 @@ This document describes the structure of the JSON reports produced by `urlcrawle
"status4xx": 1,
"status5xx": 0,
"statusOther": 0
- }
+ },
+ "reportSummary": "crawled=2 sitemap=2 links=5 ok=4 broken=1",
+ "topExternalDomains": [{"domain": "example-cdn.com", "count": 2}],
+ "brokenSample": [{"url": "https://other.example/broken", "statusCode": 404, "ok": false}],
+ "brokenByDomain": [{"domain": "other.example", "count": 1}],
+ "pages": {
+ "https://example.com": {"title": "Home — Example", "responseTimeMs": 42, "contentLength": 5123, "depth": 0}
+ },
+ "depthDistribution": {"0": 1, "1": 3},
+ "robots": {"present": true, "fetchedAt": "2025-08-31T12:34:59Z"}
}
```
@@ -95,6 +104,15 @@ This document describes the structure of the JSON reports produced by `urlcrawle
- **topExternalDomains** (DomainCount[]): Top external domains referenced by links.
- **brokenSample** (LinkStatus[]): Up to 10 example broken links.
- **brokenByDomain** (DomainCount[]): Broken link counts grouped by domain.
+- **pages** (object map): Per-page metrics.
+- **title** (string): The page `<title>` text.
+- **responseTimeMs** (number): Time to fetch the document, in milliseconds.
+ - **contentLength** (number): Size of the fetched body in bytes (best effort).
+ - **depth** (number): Crawl depth from the start URL.
+- **depthDistribution** (object map): Count of pages by depth.
+- **robots** (object): robots.txt summary.
+ - **present** (boolean): True if `robots.txt` exists and returned 200.
+ - **fetchedAt** (string, RFC3339; optional): Fetch time when present.
### Notes